From 5009252a05144f439e76502083c4380c33683054 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:59:14 +0200 Subject: [PATCH] Better CI (#38552) better CI Co-authored-by: ydshieh --- .github/workflows/build-docker-images.yml | 40 +---------- .github/workflows/self-scheduled-caller.yml | 12 ---- .github/workflows/self-scheduled.yml | 70 ------------------- docker/transformers-all-latest-gpu/Dockerfile | 2 +- 4 files changed, 2 insertions(+), 122 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index a51b1f9f154..fe1f18f42b9 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -19,7 +19,7 @@ concurrency: jobs: latest-docker: - name: "Latest PyTorch + TensorFlow [dev]" + name: "Latest PyTorch [dev]" runs-on: group: aws-general-8-plus steps: @@ -267,44 +267,6 @@ jobs: status: ${{ job.status }} slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} - latest-tensorflow: - name: "Latest TensorFlow [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: - group: aws-general-8-plus - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v4 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-tensorflow-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-tensorflow-gpu - - - name: Post to Slack - if: always() - uses: huggingface/hf-workflows/.github/actions/post-slack@main - with: - slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} - title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build - status: ${{ job.status }} - slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} - latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" runs-on: diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index f48d357cd5d..d6e36e90044 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -69,18 +69,6 @@ jobs: report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit - tf-pipeline: - name: TF pipeline CI - uses: ./.github/workflows/self-scheduled.yml - with: - job: run_pipelines_tf_gpu - slack_report_channel: "#transformers-ci-daily-pipeline-tf" - runner: daily-ci - docker: huggingface/transformers-tensorflow-gpu - ci_event: Daily CI - report_repo_id: hf-internal-testing/transformers_daily_ci - secrets: inherit - example-ci: name: Example CI uses: ./.github/workflows/self-scheduled.yml diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 36c113190ca..5ad51bc008a 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -209,75 +209,6 @@ jobs: name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports - run_pipelines_tf_gpu: - if: ${{ inputs.job == 'run_pipelines_tf_gpu' }} - name: TensorFlow pipelines - strategy: - fail-fast: false - matrix: - machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] - runs-on: - group: '${{ matrix.machine_type }}' - container: - image: huggingface/transformers-tensorflow-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /transformers - run: | - git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Set `machine_type` for report and artifact names - working-directory: /transformers - shell: bash - run: | - echo "${{ matrix.machine_type }}" - - if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then - machine_type=single-gpu - elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then - machine_type=multi-gpu - else - machine_type=${{ matrix.machine_type }} - fi - - echo "$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines - - - name: Failure short reports - if: ${{ always() }} - run: | - cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt - - - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports - path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports - run_examples_gpu: if: ${{ inputs.job == 'run_examples_gpu' }} name: Examples directory @@ -571,7 +502,6 @@ jobs: run_models_gpu, run_trainer_and_fsdp_gpu, run_pipelines_torch_gpu, - run_pipelines_tf_gpu, run_examples_gpu, run_torch_cuda_extensions_gpu, run_quantization_torch_gpu, diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index f9e9aa17897..f0b43e23ec3 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -28,7 +28,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. # Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). -RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability RUN python3 -m pip uninstall -y flax jax