From 1f2c00d67148b85199dff12e322c1095cd2286f4 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 19 May 2023 20:31:55 +0200 Subject: [PATCH] Fix DeepSpeed stuff in the nightly CI (#23478) fix Co-authored-by: ydshieh --- .github/workflows/build-nightly-ci-docker-images.yml | 10 ++++++++++ .../Dockerfile | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml index f13dda7daa8..1b8cab864d9 100644 --- a/.github/workflows/build-nightly-ci-docker-images.yml +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -52,6 +52,16 @@ jobs: name: "Nightly PyTorch + DeepSpeed" runs-on: ubuntu-latest steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index 50efc08129d..b3ead0c6154 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -1,4 +1,4 @@ -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12 FROM nvcr.io/nvidia/pytorch:22.12-py3 LABEL maintainer="Hugging Face" @@ -25,6 +25,9 @@ RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +# Uninstall `transformer-engine` shipped with the base image +RUN python3 -m pip uninstall -y transformer-engine + # Uninstall `torch-tensorrt` and `apex` shipped with the base image RUN python3 -m pip uninstall -y torch-tensorrt apex