diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index c9b63233d0c..a40a3b3eace 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -205,6 +205,13 @@ jobs:
         working-directory: /workspace/transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      # Uninstall and rebuild DeepSpeed after the checkout step, to avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /workspace/transformers
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
@@ -247,6 +254,13 @@ jobs:
         working-directory: /workspace/transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      # Uninstall and rebuild DeepSpeed after the checkout step, to avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /workspace/transformers
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index b1f49ee190e..8797287ba43 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -306,6 +306,13 @@ jobs:
         working-directory: /workspace/transformers
         run: git fetch && git checkout ${{ github.sha }}
 
+      # Uninstall and rebuild DeepSpeed after the checkout step, to avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /workspace/transformers
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
index 3819340ec5a..bd62628989e 100644
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@@ -9,11 +9,18 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 
 ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
-RUN python3 -m pip install --no-cache-dir -e ./transformers[deepspeed-testing]
 
 # Install latest release PyTorch
+# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
+# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
 RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
 
+RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+
+# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+RUN python3 -m pip uninstall -y deepspeed
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop