From d4564df1d4a6b355779f1a8ac250cb47cb4c38d8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:57:24 +0200 Subject: [PATCH] Revive Nightly/Past CI (#31159) * build * build * build * build --------- Co-authored-by: ydshieh --- .../build-nightly-ci-docker-images.yml | 20 - .github/workflows/model_jobs.yml | 23 +- .github/workflows/self-nightly-caller.yml | 43 +++ .../workflows/self-nightly-past-ci-caller.yml | 88 +++-- .github/workflows/self-nightly-scheduled.yml | 290 -------------- .github/workflows/self-past-caller.yml | 40 ++ .github/workflows/self-past.yml | 357 ------------------ .github/workflows/self-scheduled-caller.yml | 19 + .github/workflows/self-scheduled.yml | 73 +++- .github/workflows/slack-report.yml | 7 +- utils/notification_service.py | 13 +- utils/notification_service_quantization.py | 2 +- 12 files changed, 233 insertions(+), 742 deletions(-) create mode 100644 .github/workflows/self-nightly-caller.yml delete mode 100644 .github/workflows/self-nightly-scheduled.yml create mode 100644 .github/workflows/self-past-caller.yml delete mode 100644 .github/workflows/self-past.yml diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml index 691369c765a..0b1b7df5f8a 100644 --- a/.github/workflows/build-nightly-ci-docker-images.yml +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -15,16 +15,6 @@ jobs: name: "Nightly PyTorch + Stable TensorFlow" runs-on: [intel-cpu, 8-cpu, ci] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -52,16 +42,6 @@ jobs: name: "Nightly PyTorch + DeepSpeed" runs-on: [intel-cpu, 8-cpu, ci] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 840df8b6979..454d03f4245 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -12,6 +12,12 @@ on: slice_id: required: true type: number + runner: + required: true + type: string + docker: + required: true + type: string env: HF_HOME: /mnt/cache @@ -31,12 +37,13 @@ jobs: run_models_gpu: name: " " strategy: + max-parallel: 8 fail-fast: false matrix: folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} - runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: - image: huggingface/transformers-all-latest-gpu + image: ${{ inputs.docker }} options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Echo input and matrix info @@ -65,6 +72,18 @@ jobs: working-directory: /transformers run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
+ - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') }} + working-directory: /transformers + run: | + python3 -m pip install -U datasets + + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }} + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + - name: NVIDIA-SMI run: | nvidia-smi diff --git a/.github/workflows/self-nightly-caller.yml b/.github/workflows/self-nightly-caller.yml new file mode 100644 index 00000000000..5538e2d56e7 --- /dev/null +++ b/.github/workflows/self-nightly-caller.yml @@ -0,0 +1,43 @@ +name: Self-hosted runner (nightly-ci) + + +on: + repository_dispatch: + schedule: + - cron: "17 2 * * *" + push: + branches: + - run_nightly_ci* + +jobs: + build_nightly_ci_images: + name: Build Nightly CI Docker Images + if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) + uses: ./.github/workflows/build-nightly-ci-docker-images.yml + secrets: inherit + + model-ci: + name: Model CI + needs: [build_nightly_ci_images] + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-past-future" + runner: ci + docker: huggingface/transformers-all-latest-torch-nightly-gpu + ci_event: Nightly CI + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + needs: [build_nightly_ci_images] + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-past-future" + runner: ci + # test deepspeed nightly build with the latest release torch + docker: huggingface/transformers-pytorch-deepspeed-latest-gpu + ci_event: Nightly CI + working-directory-prefix: /workspace + secrets: inherit diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml index 67840355960..142399a6366 100644 --- a/.github/workflows/self-nightly-past-ci-caller.yml +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -2,32 +2,30 @@ name: Self-hosted runner (nightly-past-ci-caller) on: schedule: - # 2:17 am on each Sunday and Thursday - - - cron: "17 2 * * 0,4" + - cron: "17 2,14 * * *" push: branches: - - run_nightly_ci* - run_past_ci* jobs: - build_nightly_ci_images: - name: Build Nightly CI Docker Images - if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) - uses: ./.github/workflows/build-nightly-ci-docker-images.yml - secrets: inherit - - run_nightly_ci: - name: Nightly CI - needs: [build_nightly_ci_images] - uses: ./.github/workflows/self-nightly-scheduled.yml - secrets: inherit + get_number: + name: Get number + runs-on: ubuntu-22.04 + outputs: + run_number: ${{ steps.get_number.outputs.run_number }} + steps: + - name: Get number + id: get_number + run: | + echo "${{ github.run_number }}" + echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" + echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT run_past_ci_pytorch_1-13: name: PyTorch 1.13 - if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - needs: [run_nightly_ci] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 0 
&& (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + uses: ./.github/workflows/self-past-caller.yml with: framework: pytorch version: "1.13" @@ -36,9 +34,9 @@ jobs: run_past_ci_pytorch_1-12: name: PyTorch 1.12 - if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - needs: [run_past_ci_pytorch_1-13] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + uses: ./.github/workflows/self-past-caller.yml with: framework: pytorch version: "1.12" @@ -47,9 +45,9 @@ jobs: run_past_ci_pytorch_1-11: name: PyTorch 1.11 - if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - needs: [run_past_ci_pytorch_1-12] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + uses: ./.github/workflows/self-past-caller.yml with: framework: pytorch version: "1.11" @@ -58,9 +56,9 @@ jobs: run_past_ci_tensorflow_2-11: name: TensorFlow 2.11 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_pytorch_1-11] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 3 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.11" @@ -69,9 +67,9 @@ jobs: run_past_ci_tensorflow_2-10: name: TensorFlow 2.10 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_tensorflow_2-11] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 4 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.10" @@ -80,9 +78,9 @@ jobs: run_past_ci_tensorflow_2-9: name: TensorFlow 2.9 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_tensorflow_2-10] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 5 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.9" @@ -91,9 +89,9 @@ jobs: run_past_ci_tensorflow_2-8: name: TensorFlow 2.8 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_tensorflow_2-9] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 6 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.8" @@ -102,9 
+100,9 @@ jobs: run_past_ci_tensorflow_2-7: name: TensorFlow 2.7 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_tensorflow_2-8] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 7 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.7" @@ -113,9 +111,9 @@ jobs: run_past_ci_tensorflow_2-6: name: TensorFlow 2.6 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_tensorflow_2-7] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 8 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.6" @@ -124,9 +122,9 @@ jobs: run_past_ci_tensorflow_2-5: name: TensorFlow 2.5 - if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) - needs: [run_past_ci_tensorflow_2-6] - uses: ./.github/workflows/self-past.yml + needs: get_number + if: needs.get_number.outputs.run_number == 9 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml with: framework: tensorflow version: "2.5" diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml deleted file mode 100644 index 875e715b068..00000000000 --- a/.github/workflows/self-nightly-scheduled.yml +++ /dev/null @@ -1,290 +0,0 @@ -name: Self-hosted runner (nightly-ci) - -# Note that each job's dependencies go into a corresponding docker file. 
-# -# For example for `run_torch_cuda_extensions_gpu` the docker image is -# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at -# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` - -on: - repository_dispatch: - workflow_call: - -env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes - HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 - CUDA_VISIBLE_DEVICES: 0,1 - -jobs: - setup: - name: Setup - strategy: - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - container: - image: huggingface/transformers-all-latest-torch-nightly-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - name: Update clone - working-directory: /transformers - run: | - git fetch && git checkout ${{ github.sha }} - - - name: Cleanup - working-directory: /transformers - run: | - rm -rf tests/__pycache__ - rm -rf tests/models/__pycache__ - rm -rf reports - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - id: set-matrix - name: Identify models to test - working-directory: /transformers/tests - run: | - echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT - - - name: NVIDIA-SMI - run: | - nvidia-smi - - run_tests_single_gpu: - name: Model tests - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - container: - image: huggingface/transformers-all-latest-torch-nightly-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_tests_multi_gpu: - name: Model tests - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - container: - image: huggingface/transformers-all-latest-torch-nightly-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_torch_cuda_extensions_gpu: - name: Torch CUDA extension tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - needs: setup - container: - image: huggingface/transformers-pytorch-deepspeed-nightly-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /workspace/transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /workspace/transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Remove cached torch extensions - run: rm -rf /github/home/.cache/torch_extensions/ - - # To avoid unknown test failures - - name: Pre build DeepSpeed *again* - working-directory: /workspace - run: | - python3 -m pip uninstall -y deepspeed - rm -rf DeepSpeed - git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /workspace/transformers - run: | - python utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /workspace/transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /workspace/transformers - run: | - python -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_nightly" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_nightly - path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - - send_results: - name: Send results to webhook - runs-on: ubuntu-22.04 - if: always() - needs: [ - setup, - run_tests_single_gpu, - run_tests_multi_gpu, - run_torch_cuda_extensions_gpu - ] - steps: - - name: Preliminary job status - shell: bash - # For the meaning of these environment variables, see the job `Setup` - run: | - echo "Setup status: ${{ needs.setup.result }}" - - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - - name: Send message to Slack - env: - CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} - CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} - CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} - CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} - ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: Nightly CI - SETUP_STATUS: ${{ needs.setup.result }} - # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change - # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. 
- run: | - pip install slack_sdk - pip show slack_sdk - python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" - - - # delete-artifact - - uses: geekyeggo/delete-artifact@v2 - with: - name: | - single-* - multi-* diff --git a/.github/workflows/self-past-caller.yml b/.github/workflows/self-past-caller.yml new file mode 100644 index 00000000000..1929a01c34d --- /dev/null +++ b/.github/workflows/self-past-caller.yml @@ -0,0 +1,40 @@ +name: Self-hosted runner (past-ci) + + +on: + workflow_call: + inputs: + framework: + required: true + type: string + version: + required: true + type: string + # Use this to control the commit to test against + sha: + default: 'main' + required: false + type: string + +jobs: + model-ci: + name: Model CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-past-future" + runner: past-ci + docker: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + ci_event: Past CI - ${{ inputs.framework }}-${{ inputs.version }} + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-past-future" + runner: past-ci + docker: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + ci_event: Past CI - ${{ inputs.framework }}-${{ inputs.version }} + secrets: inherit diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml deleted file mode 100644 index ca47c454f68..00000000000 --- a/.github/workflows/self-past.yml +++ /dev/null @@ -1,357 +0,0 @@ -name: Self-hosted runner (past-ci) - -# Note that each job's dependencies go into a corresponding docker file. 
-# -# For example for `run_torch_cuda_extensions_gpu` the docker image is -# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at -# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` - -on: - workflow_call: - inputs: - framework: - required: true - type: string - version: - required: true - type: string - # Use this to control the commit to test against - sha: - default: 'main' - required: false - type: string - -env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes - HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 - CUDA_VISIBLE_DEVICES: 0,1 - -jobs: - setup: - name: Setup - strategy: - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - container: - image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ inputs.sha }} - - - name: Cleanup - working-directory: /transformers - run: | - rm -rf tests/__pycache__ - rm -rf tests/models/__pycache__ - rm -rf reports - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - id: set-matrix - working-directory: /transformers - name: Identify models to test - run: | - cd tests - echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT - - run_tests_single_gpu: - name: Model tests - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - container: - image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ inputs.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Update some packages - working-directory: /transformers - run: python3 -m pip install -U datasets - - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). 
- run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Install - if: inputs.framework == 'pytorch' - working-directory: /transformers - run: | - python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Save job name - if: ${{ always() }} - shell: bash - run: | - matrix_folders=${matrix_folders/'models_'/'models/'} - job_name="Model tests ($matrix_folders, ${{ matrix.machine_type }})" - echo "$job_name" - echo "$job_name" > /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/job_name.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_tests_multi_gpu: - name: Model tests - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - container: - image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ inputs.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Update some packages - working-directory: /transformers - run: python3 -m pip install -U datasets - - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). 
- run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Install - if: inputs.framework == 'pytorch' - working-directory: /transformers - run: | - python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Save job name - if: ${{ always() }} - shell: bash - run: | - matrix_folders=${matrix_folders/'models_'/'models/'} - job_name="Model tests ($matrix_folders, ${{ matrix.machine_type }})" - echo "$job_name" - echo "$job_name" > /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/job_name.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_torch_cuda_extensions_gpu: - name: Torch CUDA extension tests - if: inputs.framework == 'pytorch' - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci] - needs: setup - container: - image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Update some packages - working-directory: /transformers - run: python3 -m pip install -U datasets - - - name: Install - working-directory: /transformers - run: | - python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate - - - name: Remove cached torch extensions - run: rm -rf /github/home/.cache/torch_extensions/ - - # To avoid unknown test failures - - name: Pre build DeepSpeed *again* - working-directory: / - run: | - python3 -m pip uninstall -y deepspeed - rm -rf DeepSpeed - git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} - path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - - send_results: - name: Send results to webhook - runs-on: ubuntu-22.04 - if: always() - needs: [ - setup, - run_tests_single_gpu, - run_tests_multi_gpu, - run_torch_cuda_extensions_gpu - ] - steps: - - name: Preliminary job status - shell: bash - # For the meaning of these environment variables, see the job `Setup` - run: | - echo "Setup status: ${{ needs.setup.result }}" - - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - - # Create a directory to store test failure tables in the next step - - name: Create directory - run: mkdir test_failure_tables - - - name: Send message to Slack - env: - CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} - CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} - CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} - CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} - ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }} - SETUP_STATUS: ${{ needs.setup.result }} - # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change - # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. - run: | - pip install slack_sdk - pip show slack_sdk - python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" - - # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. 
- - name: Failure table artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} - path: test_failure_tables - - # delete-artifact - - uses: geekyeggo/delete-artifact@v2 - with: - name: | - single-* - multi-* diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 40689c629a0..75ea3bb24bc 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -16,6 +16,9 @@ jobs: with: job: run_models_gpu slack_report_channel: "#transformers-ci-daily-models" + runner: daily-ci + docker: huggingface/transformers-all-latest-gpu + ci_event: Daily CI secrets: inherit torch-pipeline: @@ -24,6 +27,9 @@ jobs: with: job: run_pipelines_torch_gpu slack_report_channel: "#transformers-ci-daily-pipeline-torch" + runner: daily-ci + docker: huggingface/transformers-pytorch-gpu + ci_event: Daily CI secrets: inherit tf-pipeline: @@ -32,6 +38,9 @@ jobs: with: job: run_pipelines_tf_gpu slack_report_channel: "#transformers-ci-daily-pipeline-tf" + runner: daily-ci + docker: huggingface/transformers-tensorflow-gpu + ci_event: Daily CI secrets: inherit example-ci: @@ -40,6 +49,9 @@ jobs: with: job: run_examples_gpu slack_report_channel: "#transformers-ci-daily-examples" + runner: daily-ci + docker: huggingface/transformers-all-latest-gpu + ci_event: Daily CI secrets: inherit deepspeed-ci: @@ -48,6 +60,10 @@ jobs: with: job: run_torch_cuda_extensions_gpu slack_report_channel: "#transformers-ci-daily-deepspeed" + runner: daily-ci + docker: huggingface/transformers-pytorch-deepspeed-latest-gpu + ci_event: Daily CI + working-directory-prefix: /workspace secrets: inherit quantization-ci: @@ -56,4 +72,7 @@ jobs: with: job: run_quantization_torch_gpu slack_report_channel: "#transformers-ci-daily-quantization" + runner: daily-ci + docker: huggingface/transformers-quantization-latest-gpu + ci_event: Daily CI secrets: inherit diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 5911c81bf4f..b056759aa77 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -15,6 +15,19 @@ on: slack_report_channel: required: true type: string + runner: + required: true + type: string + docker: + required: true + type: string + ci_event: + required: true + type: string + working-directory-prefix: + default: '' + required: false + type: string env: HF_HOME: /mnt/cache @@ -38,7 +51,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -96,6 +109,8 @@ jobs: folder_slices: ${{ needs.setup.outputs.folder_slices }} machine_type: ${{ matrix.machine_type }} slice_id: ${{ matrix.slice_id }} + runner: ${{ inputs.runner }} + docker: ${{ inputs.docker }} secrets: inherit run_pipelines_torch_gpu: @@ -105,7 +120,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: image: huggingface/transformers-pytorch-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ 
-155,7 +170,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: image: huggingface/transformers-tensorflow-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -206,7 +221,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -257,69 +272,88 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: - image: huggingface/transformers-pytorch-deepspeed-latest-gpu + image: ${{ inputs.docker }} options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone - working-directory: /workspace/transformers + working-directory: ${{ inputs.working-directory-prefix }}/transformers run: git fetch && git checkout ${{ github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /workspace/transformers + working-directory: ${{ inputs.working-directory-prefix }}/transformers run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }} + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: | + python3 -m pip install -U datasets + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + - name: Remove cached torch extensions run: rm -rf /github/home/.cache/torch_extensions/ # To avoid unknown test failures - - name: Pre build DeepSpeed *again* - working-directory: /workspace + - name: Pre build DeepSpeed *again* (for daily CI) + if: ${{ contains(inputs.ci_event, 'Daily CI') }} + working-directory: ${{ inputs.working-directory-prefix }}/ run: | python3 -m pip uninstall -y deepspeed DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* (for nightly & Past CI) + if: ${{ contains(inputs.ci_event, 'Nightly CI') || contains(inputs.ci_event, 'Past CI') }} + working-directory: ${{ inputs.working-directory-prefix }}/ + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + - name: NVIDIA-SMI run: | nvidia-smi - name: Environment - working-directory: /workspace/transformers + working-directory: ${{ inputs.working-directory-prefix }}/transformers run: | - python utils/print_env.py + python3 utils/print_env.py - name: Show installed libraries and their versions - working-directory: /workspace/transformers + working-directory: ${{ inputs.working-directory-prefix }}/transformers run: pip freeze - name: Run all tests on GPU - working-directory: /workspace/transformers + working-directory: ${{ inputs.working-directory-prefix }}/transformers run: | - python -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: ${{ inputs.working-directory-prefix }}/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports run_quantization_torch_gpu: if: ${{ inputs.job == 'run_quantization_torch_gpu' }} name: " " needs: setup strategy: + max-parallel: 4 fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }} machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}'] container: image: huggingface/transformers-quantization-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -434,5 +468,6 @@ jobs: # This would be an empty string if `setup` is skipped. 
folder_slices: ${{ needs.setup.outputs.folder_slices }} quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} - + ci_event: ${{ inputs.ci_event }} + secrets: inherit diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 9339e6a7b45..ee2962ba89c 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -18,6 +18,9 @@ on: quantization_matrix: required: true type: string + ci_event: + required: true + type: string env: TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} @@ -45,7 +48,7 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: scheduled + CI_EVENT: ${{ inputs.ci_event }} CI_SHA: ${{ github.sha }} CI_WORKFLOW_REF: ${{ github.workflow_ref }} CI_TEST_JOB: ${{ inputs.job }} @@ -76,7 +79,7 @@ jobs: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} - CI_EVENT: scheduled + CI_EVENT: ${{ inputs.ci_event }} CI_SHA: ${{ github.sha }} CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} diff --git a/utils/notification_service.py b/utils/notification_service.py index 7f3bbd61f5f..3be412d09da 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -641,7 +641,7 @@ class Message: def get_new_model_failure_blocks(self, with_header=True, to_truncate=True): if self.prev_ci_artifacts is None: - return {} + return [] sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0]) @@ -767,10 +767,11 @@ class Message: # To save the list of new model failures blocks = self.get_new_model_failure_blocks(to_truncate=False) - failure_text = blocks[-1]["text"]["text"] - file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.txt") - with open(file_path, "w", encoding="UTF-8") as fp: - fp.write(failure_text) + if blocks: + failure_text = blocks[-1]["text"]["text"] + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.txt") + with open(file_path, "w", encoding="UTF-8") as fp: + fp.write(failure_text) def retrieve_artifact(artifact_path: str, gpu: Optional[str]): @@ -891,7 +892,7 @@ if __name__ == "__main__": # To find the PR number in a commit title, for example, `Add AwesomeFormer model (#99999)` pr_number_re = re.compile(r"\(#(\d+)\)$") - title = f"🤗 Results of the {ci_event} tests." + title = f"🤗 Results of {ci_event} - {os.getenv('CI_TEST_JOB')}." # Add Commit/PR title with a link for push CI # (check the title in 2 env. variables - depending on the CI is triggered via `push` or `workflow_run` event) ci_title_push = os.environ.get("CI_TITLE_PUSH") diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py index 32109765407..0264797c94e 100644 --- a/utils/notification_service_quantization.py +++ b/utils/notification_service_quantization.py @@ -175,7 +175,7 @@ if __name__ == "__main__": # This env. variable is set in workflow file (under the job `send_results`). ci_event = os.environ["CI_EVENT"] - title = f"🤗 Results of the {ci_event} tests." + title = f"🤗 Results of the {ci_event} - {os.getenv('CI_TEST_JOB')}." if setup_failed: Message.error_out(
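
Note on the past-CI rotation introduced above: the `get_number` job in `self-nightly-past-ci-caller.yml` reduces `github.run_number` modulo 10, and each framework/version job is gated on one residue (PyTorch 1.13/1.12/1.11 on 0-2, TensorFlow 2.11 down to 2.5 on 3-9), so each twice-daily cron tick (`17 2,14 * * *`) triggers at most one past-CI configuration; per their `if:` conditions, the TensorFlow entries additionally require a push to a `run_past_ci*` branch. A minimal Python sketch of that selection, assuming nothing beyond the workflow file itself (`PAST_CI_JOBS` and `select_past_ci_job` are illustrative names, not part of the workflow):

# Illustrative sketch of the run_number -> past-CI job rotation in
# .github/workflows/self-nightly-past-ci-caller.yml; the table mirrors the
# per-job `if: needs.get_number.outputs.run_number == N` conditions.
PAST_CI_JOBS = [
    ("pytorch", "1.13"),     # run_number % 10 == 0
    ("pytorch", "1.12"),     # == 1
    ("pytorch", "1.11"),     # == 2
    ("tensorflow", "2.11"),  # == 3
    ("tensorflow", "2.10"),  # == 4
    ("tensorflow", "2.9"),   # == 5
    ("tensorflow", "2.8"),   # == 6
    ("tensorflow", "2.7"),   # == 7
    ("tensorflow", "2.6"),   # == 8
    ("tensorflow", "2.5"),   # == 9
]

def select_past_ci_job(run_number: int) -> tuple[str, str]:
    """Return the (framework, version) pair a given workflow run would test."""
    return PAST_CI_JOBS[run_number % 10]

print(select_past_ci_job(31159))  # ('tensorflow', '2.5'), since 31159 % 10 == 9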