From 7819911b0c4f490fbe72d45f37e4613c7cb78bdf Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 5 May 2025 16:17:45 +0200 Subject: [PATCH] Use T4 single GPU runner with more CPU RAM (#37961) larger T4 single GPU Co-authored-by: ydshieh --- .../workflows/check_failed_model_tests.yml | 2 +- .github/workflows/doctest_job.yml | 2 +- .github/workflows/doctests.yml | 2 +- .github/workflows/model_jobs.yml | 2 +- .github/workflows/self-comment-ci.yml | 8 +++--- .github/workflows/self-scheduled.yml | 26 +++++++++---------- .github/workflows/ssh-runner.yml | 2 +- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_model_tests.yml index 5963523fd76..8366707845c 100644 --- a/.github/workflows/check_failed_model_tests.yml +++ b/.github/workflows/check_failed_model_tests.yml @@ -29,7 +29,7 @@ jobs: run_models_gpu: name: " " runs-on: - group: aws-g4dn-2xlarge-cache + group: aws-g4dn-4xlarge-cache container: image: ${{ inputs.docker }} options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/doctest_job.yml b/.github/workflows/doctest_job.yml index eb62b797b8e..b881bc38e59 100644 --- a/.github/workflows/doctest_job.yml +++ b/.github/workflows/doctest_job.yml @@ -28,7 +28,7 @@ jobs: matrix: split_keys: ${{ fromJson(inputs.split_keys) }} runs-on: - group: aws-g4dn-2xlarge-cache + group: aws-g4dn-4xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 472b07684ed..bdf967a6d9e 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -15,7 +15,7 @@ jobs: setup: name: Setup runs-on: - group: aws-g4dn-2xlarge-cache + group: aws-g4dn-4xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 95584176d6c..b88304f7868 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -107,7 +107,7 @@ jobs: run: | echo "${{ inputs.machine_type }}" - if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml index 6567d9fad69..61d0866e754 100644 --- a/.github/workflows/self-comment-ci.yml +++ b/.github/workflows/self-comment-ci.yml @@ -185,7 +185,7 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.get-tests.outputs.models) }} - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -239,7 +239,7 @@ jobs: shell: bash run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu @@ -292,7 +292,7 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }} - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -338,7 +338,7 @@ jobs: shell: bash run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 7fce6d60800..1198148fd63 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -49,7 +49,7 @@ jobs: name: Setup strategy: matrix: - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -107,7 +107,7 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} uses: ./.github/workflows/model_jobs.yml with: @@ -125,7 +125,7 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] slice_id: [0, 1] uses: ./.github/workflows/model_jobs.yml with: @@ -143,7 +143,7 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -177,7 +177,7 @@ jobs: run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu @@ -211,7 +211,7 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -246,7 +246,7 @@ jobs: run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu @@ -280,7 +280,7 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-2xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -314,7 +314,7 @@ jobs: run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu @@ -349,7 +349,7 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -411,7 +411,7 @@ jobs: run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu @@ -448,7 +448,7 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }} - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] runs-on: group: '${{ matrix.machine_type }}' container: @@ -491,7 +491,7 @@ jobs: run: | echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then machine_type=single-gpu elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then machine_type=multi-gpu diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index e648883f191..62277363033 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -35,7 +35,7 @@ jobs: shell: bash run: | if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then - echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV + echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then