update deepspeed docker (#37371)

* update

* create docker image

* 03

* uninstall pytest as it conflicts with transformers

* wrong one

* better

* see which package depends on pytest

* up

* reinstall

* fix

* deepspeedddddddd

* deepspeedddddddd

* deepspeedddddddd

* deepspeedddddddd

* deepspeedddddddd

* deepspeedddddddd

* deepspeedddddddd

* deepspeedddddddd

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Marc Sun 2025-04-09 14:54:06 +02:00 committed by GitHub
parent e3eda6d188
commit 7ae0be722e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 29 additions and 27 deletions

View File

@ -63,14 +63,14 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-torch-deepspeed-docker: latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed" name: "Latest PyTorch + DeepSpeed"
runs-on: runs-on:
group: aws-general-8-plus group: aws-g4dn-2xlarge-cache
steps: steps:
- -
name: Set up Docker Buildx name: Set up Docker Buildx
@ -99,7 +99,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -140,7 +140,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -176,7 +176,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-doc-builder docker build title: 🤗 Results of the huggingface/transformers-doc-builder docker build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -214,7 +214,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -223,19 +223,19 @@ jobs:
runs-on: runs-on:
group: aws-general-8-plus group: aws-general-8-plus
steps: steps:
- -
name: Set up Docker Buildx name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 uses: docker/setup-buildx-action@v3
- -
name: Check out code name: Check out code
uses: actions/checkout@v4 uses: actions/checkout@v4
- -
name: Login to DockerHub name: Login to DockerHub
uses: docker/login-action@v3 uses: docker/login-action@v3
with: with:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }} password: ${{ secrets.DOCKERHUB_PASSWORD }}
- -
name: Build and push name: Build and push
uses: docker/build-push-action@v5 uses: docker/build-push-action@v5
with: with:
@ -263,7 +263,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -301,7 +301,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -310,19 +310,19 @@ jobs:
runs-on: runs-on:
group: aws-general-8-plus group: aws-general-8-plus
steps: steps:
- -
name: Set up Docker Buildx name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 uses: docker/setup-buildx-action@v3
- -
name: Check out code name: Check out code
uses: actions/checkout@v4 uses: actions/checkout@v4
- -
name: Login to DockerHub name: Login to DockerHub
uses: docker/login-action@v3 uses: docker/login-action@v3
with: with:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }} password: ${{ secrets.DOCKERHUB_PASSWORD }}
- -
name: Build and push name: Build and push
uses: docker/build-push-action@v5 uses: docker/build-push-action@v5
with: with:
@ -350,7 +350,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -388,6 +388,6 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main uses: huggingface/hf-workflows/.github/actions/post-slack@main
with: with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-quantization-latest-gpu build title: 🤗 Results of the transformers-quantization-latest-gpu build
status: ${{ job.status }} status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@ -42,7 +42,7 @@ jobs:
nightly-torch-deepspeed-docker: nightly-torch-deepspeed-docker:
name: "Nightly PyTorch + DeepSpeed" name: "Nightly PyTorch + DeepSpeed"
runs-on: runs-on:
group: aws-general-8-plus group: aws-g4dn-2xlarge-cache
steps: steps:
- -
name: Set up Docker Buildx name: Set up Docker Buildx

View File

@ -1,12 +1,12 @@
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:23.11-py3 FROM nvcr.io/nvidia/pytorch:24.08-py3
LABEL maintainer="Hugging Face" LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
ARG PYTORCH='2.2.0' ARG PYTORCH='2.6.0'
# Example: `cu102`, `cu113`, etc. # Example: `cu102`, `cu113`, etc.
ARG CUDA='cu121' ARG CUDA='cu126'
RUN apt -y update RUN apt -y update
RUN apt install -y libaio-dev RUN apt install -y libaio-dev
@ -15,7 +15,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
# Install latest release PyTorch # Install latest release PyTorch
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)

View File

@ -1,11 +1,11 @@
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
FROM nvcr.io/nvidia/pytorch:23.11-py3 FROM nvcr.io/nvidia/pytorch:24.08-py3
LABEL maintainer="Hugging Face" LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# Example: `cu102`, `cu113`, etc. # Example: `cu102`, `cu113`, etc.
ARG CUDA='cu121' ARG CUDA='cu126'
RUN apt -y update RUN apt -y update
RUN apt install -y libaio-dev RUN apt install -y libaio-dev
@ -21,7 +21,8 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate