mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-03 12:50:06 +06:00
update deepspeed docker (#37371)
* update * create docker image * 03 * uninstall pytest as it conflicts with transformers * wrong one * better * see which package depends on pytest * up * reinstall * fix * deepspeedddddddd * deepspeedddddddd * deepspeedddddddd * deepspeedddddddd * deepspeedddddddd * deepspeedddddddd * deepspeedddddddd * deepspeedddddddd --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
e3eda6d188
commit
7ae0be722e
36
.github/workflows/build-docker-images.yml
vendored
36
.github/workflows/build-docker-images.yml
vendored
@ -63,14 +63,14 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
|
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
latest-torch-deepspeed-docker:
|
latest-torch-deepspeed-docker:
|
||||||
name: "Latest PyTorch + DeepSpeed"
|
name: "Latest PyTorch + DeepSpeed"
|
||||||
runs-on:
|
runs-on:
|
||||||
group: aws-general-8-plus
|
group: aws-g4dn-2xlarge-cache
|
||||||
steps:
|
steps:
|
||||||
-
|
-
|
||||||
name: Set up Docker Buildx
|
name: Set up Docker Buildx
|
||||||
@ -99,7 +99,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
|
||||||
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
|
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -140,7 +140,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
|
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -176,7 +176,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
|
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -214,7 +214,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
|
title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -223,19 +223,19 @@ jobs:
|
|||||||
runs-on:
|
runs-on:
|
||||||
group: aws-general-8-plus
|
group: aws-general-8-plus
|
||||||
steps:
|
steps:
|
||||||
-
|
-
|
||||||
name: Set up Docker Buildx
|
name: Set up Docker Buildx
|
||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
-
|
-
|
||||||
name: Check out code
|
name: Check out code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
-
|
-
|
||||||
name: Login to DockerHub
|
name: Login to DockerHub
|
||||||
uses: docker/login-action@v3
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||||
-
|
-
|
||||||
name: Build and push
|
name: Build and push
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
@ -263,7 +263,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
|
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -301,7 +301,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
|
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -310,19 +310,19 @@ jobs:
|
|||||||
runs-on:
|
runs-on:
|
||||||
group: aws-general-8-plus
|
group: aws-general-8-plus
|
||||||
steps:
|
steps:
|
||||||
-
|
-
|
||||||
name: Set up Docker Buildx
|
name: Set up Docker Buildx
|
||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
-
|
-
|
||||||
name: Check out code
|
name: Check out code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
-
|
-
|
||||||
name: Login to DockerHub
|
name: Login to DockerHub
|
||||||
uses: docker/login-action@v3
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||||
-
|
-
|
||||||
name: Build and push
|
name: Build and push
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
@ -350,7 +350,7 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
|
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@ -388,6 +388,6 @@ jobs:
|
|||||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
with:
|
with:
|
||||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||||
title: 🤗 Results of the transformers-quantization-latest-gpu build
|
title: 🤗 Results of the transformers-quantization-latest-gpu build
|
||||||
status: ${{ job.status }}
|
status: ${{ job.status }}
|
||||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
@ -42,7 +42,7 @@ jobs:
|
|||||||
nightly-torch-deepspeed-docker:
|
nightly-torch-deepspeed-docker:
|
||||||
name: "Nightly PyTorch + DeepSpeed"
|
name: "Nightly PyTorch + DeepSpeed"
|
||||||
runs-on:
|
runs-on:
|
||||||
group: aws-general-8-plus
|
group: aws-g4dn-2xlarge-cache
|
||||||
steps:
|
steps:
|
||||||
-
|
-
|
||||||
name: Set up Docker Buildx
|
name: Set up Docker Buildx
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
|
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
|
||||||
FROM nvcr.io/nvidia/pytorch:23.11-py3
|
FROM nvcr.io/nvidia/pytorch:24.08-py3
|
||||||
LABEL maintainer="Hugging Face"
|
LABEL maintainer="Hugging Face"
|
||||||
|
|
||||||
ARG DEBIAN_FRONTEND=noninteractive
|
ARG DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
ARG PYTORCH='2.2.0'
|
ARG PYTORCH='2.6.0'
|
||||||
# Example: `cu102`, `cu113`, etc.
|
# Example: `cu102`, `cu113`, etc.
|
||||||
ARG CUDA='cu121'
|
ARG CUDA='cu126'
|
||||||
|
|
||||||
RUN apt -y update
|
RUN apt -y update
|
||||||
RUN apt install -y libaio-dev
|
RUN apt install -y libaio-dev
|
||||||
@ -15,7 +15,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
|
|||||||
ARG REF=main
|
ARG REF=main
|
||||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||||
|
|
||||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
|
||||||
|
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
|
||||||
|
|
||||||
# Install latest release PyTorch
|
# Install latest release PyTorch
|
||||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
|
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
|
||||||
FROM nvcr.io/nvidia/pytorch:23.11-py3
|
FROM nvcr.io/nvidia/pytorch:24.08-py3
|
||||||
LABEL maintainer="Hugging Face"
|
LABEL maintainer="Hugging Face"
|
||||||
|
|
||||||
ARG DEBIAN_FRONTEND=noninteractive
|
ARG DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
# Example: `cu102`, `cu113`, etc.
|
# Example: `cu102`, `cu113`, etc.
|
||||||
ARG CUDA='cu121'
|
ARG CUDA='cu126'
|
||||||
|
|
||||||
RUN apt -y update
|
RUN apt -y update
|
||||||
RUN apt install -y libaio-dev
|
RUN apt install -y libaio-dev
|
||||||
@ -21,7 +21,8 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
|
|||||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||||
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
||||||
|
|
||||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
|
||||||
|
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
|
||||||
|
|
||||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user