mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-03 12:50:06 +06:00
Switch to use A10 progressively (#38936)
* try * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
b8059e1f8f
commit
3d34b92116
6
.github/workflows/model_jobs.yml
vendored
6
.github/workflows/model_jobs.yml
vendored
@ -12,8 +12,8 @@ on:
|
|||||||
slice_id:
|
slice_id:
|
||||||
required: true
|
required: true
|
||||||
type: number
|
type: number
|
||||||
runner:
|
runner_map:
|
||||||
required: true
|
required: false
|
||||||
type: string
|
type: string
|
||||||
docker:
|
docker:
|
||||||
required: true
|
required: true
|
||||||
@ -45,7 +45,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ inputs.machine_type }}'
|
group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
|
||||||
container:
|
container:
|
||||||
image: ${{ inputs.docker }}
|
image: ${{ inputs.docker }}
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
|
128
.github/workflows/model_jobs_amd.yml
vendored
128
.github/workflows/model_jobs_amd.yml
vendored
@ -1,128 +0,0 @@
|
|||||||
name: model jobs
|
|
||||||
|
|
||||||
on:
|
|
||||||
workflow_call:
|
|
||||||
inputs:
|
|
||||||
folder_slices:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
machine_type:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
slice_id:
|
|
||||||
required: true
|
|
||||||
type: number
|
|
||||||
runner:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
docker:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
|
|
||||||
env:
|
|
||||||
HF_HOME: /mnt/cache
|
|
||||||
TRANSFORMERS_IS_CI: yes
|
|
||||||
OMP_NUM_THREADS: 8
|
|
||||||
MKL_NUM_THREADS: 8
|
|
||||||
RUN_SLOW: yes
|
|
||||||
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
|
|
||||||
# This token is created under the bot `hf-transformers-bot`.
|
|
||||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
|
||||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
|
||||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
|
||||||
CUDA_VISIBLE_DEVICES: 0,1
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
run_models_gpu:
|
|
||||||
name: " "
|
|
||||||
strategy:
|
|
||||||
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
|
||||||
runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
|
|
||||||
container:
|
|
||||||
image: ${{ inputs.docker }}
|
|
||||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
|
||||||
steps:
|
|
||||||
- name: Echo input and matrix info
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
echo "${{ inputs.folder_slices }}"
|
|
||||||
echo "${{ matrix.folders }}"
|
|
||||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
|
||||||
|
|
||||||
- name: Echo folder ${{ matrix.folders }}
|
|
||||||
shell: bash
|
|
||||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
|
||||||
# set the artifact folder names (because the character `/` is not allowed).
|
|
||||||
run: |
|
|
||||||
echo "${{ matrix.folders }}"
|
|
||||||
matrix_folders=${{ matrix.folders }}
|
|
||||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
|
||||||
echo "$matrix_folders"
|
|
||||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Update clone
|
|
||||||
working-directory: /transformers
|
|
||||||
run: git fetch && git checkout ${{ github.sha }}
|
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
|
||||||
working-directory: /transformers
|
|
||||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
|
||||||
|
|
||||||
- name: Update / Install some packages (for Past CI)
|
|
||||||
if: ${{ contains(inputs.docker, '-past-') }}
|
|
||||||
working-directory: /transformers
|
|
||||||
run: |
|
|
||||||
python3 -m pip install -U datasets
|
|
||||||
|
|
||||||
- name: Update / Install some packages (for Past CI)
|
|
||||||
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
|
|
||||||
working-directory: /transformers
|
|
||||||
run: |
|
|
||||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
|
||||||
|
|
||||||
- name: ROCM-SMI
|
|
||||||
run: |
|
|
||||||
rocm-smi
|
|
||||||
|
|
||||||
- name: ROCM-INFO
|
|
||||||
run: |
|
|
||||||
rocminfo | grep "Agent" -A 14
|
|
||||||
|
|
||||||
- name: Show ROCR environment
|
|
||||||
run: |
|
|
||||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
|
||||||
|
|
||||||
- name: Environment
|
|
||||||
working-directory: /transformers
|
|
||||||
run: |
|
|
||||||
python3 utils/print_env.py
|
|
||||||
|
|
||||||
- name: Show installed libraries and their versions
|
|
||||||
working-directory: /transformers
|
|
||||||
run: pip freeze
|
|
||||||
|
|
||||||
- name: Run all tests on GPU
|
|
||||||
working-directory: /transformers
|
|
||||||
run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
|
|
||||||
|
|
||||||
- name: Failure short reports
|
|
||||||
if: ${{ failure() }}
|
|
||||||
continue-on-error: true
|
|
||||||
run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
|
||||||
|
|
||||||
- name: Run test
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
|
||||||
echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
|
||||||
echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
|
||||||
path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
|
8
.github/workflows/self-scheduled-caller.yml
vendored
8
.github/workflows/self-scheduled-caller.yml
vendored
@ -22,7 +22,7 @@ on:
|
|||||||
default: ""
|
default: ""
|
||||||
|
|
||||||
|
|
||||||
# Used for `push` to easily modiffy the target workflow runs to compare against
|
# Used for `push` to easily modify the target workflow runs to compare against
|
||||||
env:
|
env:
|
||||||
prev_workflow_run_id: ""
|
prev_workflow_run_id: ""
|
||||||
other_workflow_run_id: ""
|
other_workflow_run_id: ""
|
||||||
@ -51,7 +51,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_models_gpu
|
job: run_models_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-models"
|
slack_report_channel: "#transformers-ci-daily-models"
|
||||||
runner: daily-ci
|
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
docker: huggingface/transformers-all-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||||
@ -63,7 +62,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_pipelines_torch_gpu
|
job: run_pipelines_torch_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
||||||
runner: daily-ci
|
|
||||||
docker: huggingface/transformers-pytorch-gpu
|
docker: huggingface/transformers-pytorch-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||||
@ -75,7 +73,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_examples_gpu
|
job: run_examples_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-examples"
|
slack_report_channel: "#transformers-ci-daily-examples"
|
||||||
runner: daily-ci
|
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
docker: huggingface/transformers-all-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||||
@ -87,7 +84,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_trainer_and_fsdp_gpu
|
job: run_trainer_and_fsdp_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-training"
|
slack_report_channel: "#transformers-ci-daily-training"
|
||||||
runner: daily-ci
|
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
docker: huggingface/transformers-all-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||||
@ -99,7 +95,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_torch_cuda_extensions_gpu
|
job: run_torch_cuda_extensions_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-training"
|
slack_report_channel: "#transformers-ci-daily-training"
|
||||||
runner: daily-ci
|
|
||||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
working-directory-prefix: /workspace
|
working-directory-prefix: /workspace
|
||||||
@ -112,7 +107,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_quantization_torch_gpu
|
job: run_quantization_torch_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-quantization"
|
slack_report_channel: "#transformers-ci-daily-quantization"
|
||||||
runner: daily-ci
|
|
||||||
docker: huggingface/transformers-quantization-latest-gpu
|
docker: huggingface/transformers-quantization-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||||
|
10
.github/workflows/self-scheduled.yml
vendored
10
.github/workflows/self-scheduled.yml
vendored
@ -15,9 +15,6 @@ on:
|
|||||||
slack_report_channel:
|
slack_report_channel:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
runner:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
docker:
|
docker:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
@ -62,6 +59,7 @@ jobs:
|
|||||||
outputs:
|
outputs:
|
||||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||||
|
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
|
||||||
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
@ -88,6 +86,7 @@ jobs:
|
|||||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||||
|
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
|
||||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||||
@ -111,14 +110,14 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||||
uses: ./.github/workflows/model_jobs.yml
|
uses: ./.github/workflows/model_jobs.yml
|
||||||
with:
|
with:
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
machine_type: ${{ matrix.machine_type }}
|
machine_type: ${{ matrix.machine_type }}
|
||||||
slice_id: ${{ matrix.slice_id }}
|
slice_id: ${{ matrix.slice_id }}
|
||||||
runner: ${{ inputs.runner }}
|
runner_map: ${{ needs.setup.outputs.runner_map }}
|
||||||
docker: ${{ inputs.docker }}
|
docker: ${{ inputs.docker }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
@ -136,7 +135,6 @@ jobs:
|
|||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
machine_type: ${{ matrix.machine_type }}
|
machine_type: ${{ matrix.machine_type }}
|
||||||
slice_id: ${{ matrix.slice_id }}
|
slice_id: ${{ matrix.slice_id }}
|
||||||
runner: ${{ inputs.runner }}
|
|
||||||
docker: ${{ inputs.docker }}
|
docker: ${{ inputs.docker }}
|
||||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
65
utils/get_runner_map.py
Normal file
65
utils/get_runner_map.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script is used to get a map containing the information of runners to use in GitHub Actions workflow files.
This is meant to be a temporary file that helps us to switch progressively from T4 to A10 runners.

The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json).
Currently, in that file, we specify the models for which we want to run the tests with T4 runners to avoid many test failures showing on the CI reports.
We will work on the tests so that A10 runners can be used for all CI jobs.
"""

import os

import requests


# T4 runner groups, keyed by the CI `machine_type` used in the workflow matrix.
T4_RUNNERS = {
    "single-gpu": "aws-g4dn-4xlarge-cache",
    "multi-gpu": "aws-g4dn-12xlarge-cache",
}

# A10 runner groups, keyed by the CI `machine_type` used in the workflow matrix.
A10_RUNNERS = {
    "single-gpu": "aws-g5-4xlarge-cache",
    "multi-gpu": "aws-g5-12xlarge-cache",
}

# Hub-hosted list of the test folders that should (still) run on T4 runners.
RUNNER_MAP_URL = (
    "https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json"
)


def _collect_test_folders():
    """Return the test folder names, model folders (`models/<name>`) first, then the other top-level test folders.

    Assumes the current working directory is the repository's `tests` directory: `os.path.isdir` is applied to
    paths relative to the CWD, so the filtering only works when run from there.
    """
    tests = os.getcwd()
    model_tests = os.listdir(os.path.join(tests, "models"))
    top_level = sorted(filter(os.path.isdir, os.listdir(tests)))
    model_folders = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
    # `models` itself is replaced by its per-model subfolders.
    top_level.remove("models")
    return model_folders + top_level


def _fetch_jobs_using_t4():
    """Download and return the collection of test folders that we want to keep on T4 runners."""
    # A timeout keeps a flaky Hub connection from hanging the whole CI run.
    response = requests.get(RUNNER_MAP_URL, timeout=60)
    # Fail loudly on HTTP errors instead of surfacing a confusing JSON decode error downstream.
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    # The test folders (e.g. `models/bert`) that we want to run with T4 runners.
    jobs_using_t4 = _fetch_jobs_using_t4()

    runner_map = {}
    for key in _collect_test_folders():
        # Membership in the Hub list is checked against the bare model name, without the `models/` prefix.
        modified_key = key
        if modified_key.startswith("models/"):
            modified_key = key[len("models/") :]
        runner_map[key] = T4_RUNNERS if modified_key in jobs_using_t4 else A10_RUNNERS

    # NOTE(review): this emits the Python `repr` of the dict (single quotes), not strict JSON; the consuming
    # workflow passes it to `fromJson` — confirm that parser accepts it before changing to `json.dumps`.
    print(runner_map)
|
Loading…
Reference in New Issue
Block a user