From 3d34b92116c26518f476be8c40250c4d89de3cc3 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 20 Jun 2025 18:10:35 +0200 Subject: [PATCH] Switch to use A10 progressively (#38936) * try * fix * fix --------- Co-authored-by: ydshieh --- .github/workflows/model_jobs.yml | 6 +- .github/workflows/model_jobs_amd.yml | 128 -------------------- .github/workflows/self-scheduled-caller.yml | 8 +- .github/workflows/self-scheduled.yml | 10 +- utils/get_runner_map.py | 65 ++++++++++ 5 files changed, 73 insertions(+), 144 deletions(-) delete mode 100644 .github/workflows/model_jobs_amd.yml create mode 100644 utils/get_runner_map.py diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index b88304f7868..c9096bec3c6 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -12,8 +12,8 @@ on: slice_id: required: true type: number - runner: - required: true + runner_map: + required: false type: string docker: required: true @@ -45,7 +45,7 @@ jobs: matrix: folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} runs-on: - group: '${{ inputs.machine_type }}' + group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }} container: image: ${{ inputs.docker }} options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/model_jobs_amd.yml b/.github/workflows/model_jobs_amd.yml deleted file mode 100644 index c90181ec6f1..00000000000 --- a/.github/workflows/model_jobs_amd.yml +++ /dev/null @@ -1,128 +0,0 @@ -name: model jobs - -on: - workflow_call: - inputs: - folder_slices: - required: true - type: string - machine_type: - required: true - type: string - slice_id: - required: true - type: number - runner: - required: true - type: string - docker: - required: true - type: string - -env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes - # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. - # This token is created under the bot `hf-transformers-bot`. - HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true - CUDA_VISIBLE_DEVICES: 0,1 - -jobs: - run_models_gpu: - name: " " - strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - fail-fast: false - matrix: - folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} - runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] - container: - image: ${{ inputs.docker }} - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Echo input and matrix info - shell: bash - run: | - echo "${{ inputs.folder_slices }}" - echo "${{ matrix.folders }}" - echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}" - - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). 
- run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Update / Install some packages (for Past CI) - if: ${{ contains(inputs.docker, '-past-') }} - working-directory: /transformers - run: | - python3 -m pip install -U datasets - - - name: Update / Install some packages (for Past CI) - if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }} - working-directory: /transformers - run: | - python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate - - - name: ROCM-SMI - run: | - rocm-smi - - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test" - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - - - name: Run test - shell: bash - run: | - mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" - - - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index d6e36e90044..88eee91a3bc 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -22,7 +22,7 @@ on: default: "" -# Used for `push` to easily modiffy the target workflow runs to compare against +# Used for `push` to easily modify the target workflow runs to compare against env: prev_workflow_run_id: "" other_workflow_run_id: "" @@ -51,7 +51,6 @@ jobs: with: job: run_models_gpu slack_report_channel: "#transformers-ci-daily-models" - runner: daily-ci docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci @@ -63,7 +62,6 @@ jobs: with: job: run_pipelines_torch_gpu slack_report_channel: "#transformers-ci-daily-pipeline-torch" - runner: daily-ci docker: huggingface/transformers-pytorch-gpu ci_event: 
Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci @@ -75,7 +73,6 @@ jobs: with: job: run_examples_gpu slack_report_channel: "#transformers-ci-daily-examples" - runner: daily-ci docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci @@ -87,7 +84,6 @@ jobs: with: job: run_trainer_and_fsdp_gpu slack_report_channel: "#transformers-ci-daily-training" - runner: daily-ci docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci @@ -99,7 +95,6 @@ jobs: with: job: run_torch_cuda_extensions_gpu slack_report_channel: "#transformers-ci-daily-training" - runner: daily-ci docker: huggingface/transformers-pytorch-deepspeed-latest-gpu ci_event: Daily CI working-directory-prefix: /workspace @@ -112,7 +107,6 @@ jobs: with: job: run_quantization_torch_gpu slack_report_channel: "#transformers-ci-daily-quantization" - runner: daily-ci docker: huggingface/transformers-quantization-latest-gpu ci_event: Daily CI report_repo_id: hf-internal-testing/transformers_daily_ci diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 5ad51bc008a..2ddf1071710 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -15,9 +15,6 @@ on: slack_report_channel: required: true type: string - runner: - required: true - type: string docker: required: true type: string @@ -62,6 +59,7 @@ jobs: outputs: folder_slices: ${{ steps.set-matrix.outputs.folder_slices }} slice_ids: ${{ steps.set-matrix.outputs.slice_ids }} + runner_map: ${{ steps.set-matrix.outputs.runner_map }} quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }} steps: - name: Update clone @@ -88,6 +86,7 @@ jobs: if [ "${{ inputs.job }}" = "run_models_gpu" ]; then echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT + echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT @@ -111,14 +110,14 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [single-gpu, multi-gpu] slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} uses: ./.github/workflows/model_jobs.yml with: folder_slices: ${{ needs.setup.outputs.folder_slices }} machine_type: ${{ matrix.machine_type }} slice_id: ${{ matrix.slice_id }} - runner: ${{ inputs.runner }} + runner_map: ${{ needs.setup.outputs.runner_map }} docker: ${{ inputs.docker }} secrets: inherit @@ -136,7 +135,6 @@ jobs: folder_slices: ${{ needs.setup.outputs.folder_slices }} machine_type: ${{ matrix.machine_type }} slice_id: ${{ matrix.slice_id }} - runner: ${{ inputs.runner }} docker: ${{ inputs.docker }} report_name_prefix: run_trainer_and_fsdp_gpu secrets: inherit diff --git a/utils/get_runner_map.py b/utils/get_runner_map.py new file mode 100644 index 00000000000..7b36651165b --- /dev/null +++ b/utils/get_runner_map.py @@ -0,0 +1,65 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script builds a map of the runners to use in GitHub Actions workflow files.
+It is meant to be a temporary helper that lets us switch progressively from T4 to A10 runners.
+
+The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json).
+Currently, that file lists the models whose tests should still run on T4 runners, to avoid many test failures showing up in the CI reports.
+We will keep working on those tests so that A10 runners can eventually be used for all CI jobs.
+"""
+
+import os
+
+import requests
+
+
+if __name__ == "__main__":
+    # T4 runners (the current default, being phased out)
+    t4_runners = {
+        "single-gpu": "aws-g4dn-4xlarge-cache",
+        "multi-gpu": "aws-g4dn-12xlarge-cache",
+    }
+
+    # A10 runners (the migration target)
+    a10_runners = {
+        "single-gpu": "aws-g5-4xlarge-cache",
+        "multi-gpu": "aws-g5-12xlarge-cache",
+    }
+
+    tests = os.getcwd()  # expected to be the repository's `tests` directory (see self-scheduled.yml)
+    model_tests = os.listdir(os.path.join(tests, "models"))
+    d1 = sorted(filter(os.path.isdir, os.listdir(tests)))  # top-level test folders
+    d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))  # per-model test folders
+    d1.remove("models")  # the per-model folders are already covered by `d2`
+    d = d2 + d1
+
+    response = requests.get(
+        "https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json"
+    )
+    # The models whose tests we still want to run with T4 runners
+    jobs_using_t4 = response.json()
+
+    runner_map = {}
+    for key in d:
+        modified_key = key
+        if modified_key.startswith("models/"):
+            modified_key = key[len("models/") :]
+        if modified_key in jobs_using_t4:
+            runner_map[key] = t4_runners
+        else:
+            runner_map[key] = a10_runners
+
+    print(runner_map)
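
For reference, a minimal sketch of the data shapes involved: the map printed above is passed
as the `runner_map` input to model_jobs.yml, which picks the runner group with
`fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type]`. This sketch assumes the
Hub-side runner_map.json is a plain JSON list of model names; the model names below are
hypothetical, while the runner labels come from the script itself.

    # Assumed contents of runner_map.json on the Hub: models still pinned to T4.
    jobs_using_t4 = ["bert", "gpt2"]

    # Excerpt of what utils/get_runner_map.py would then print:
    runner_map = {
        "models/bert": {"single-gpu": "aws-g4dn-4xlarge-cache", "multi-gpu": "aws-g4dn-12xlarge-cache"},
        "models/llama": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},
        "generation": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},
    }

    # model_jobs.yml resolves the runner group with
    #     fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type]
    # which, for folders="models/bert" on a single-gpu job, amounts to:
    assert runner_map["models/bert"]["single-gpu"] == "aws-g4dn-4xlarge-cache"
    assert runner_map["generation"]["multi-gpu"] == "aws-g5-12xlarge-cache"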