Revive Nightly/Past CI (#31159)

* build

* build

* build

* build

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Yih-Dar 2024-06-20 18:57:24 +02:00 committed by GitHub
parent ec905f3a76
commit d4564df1d4
12 changed files with 233 additions and 742 deletions


@@ -15,16 +15,6 @@ jobs:
     name: "Nightly PyTorch + Stable TensorFlow"
     runs-on: [intel-cpu, 8-cpu, ci]
     steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2

@@ -52,16 +42,6 @@ jobs:
     name: "Nightly PyTorch + DeepSpeed"
     runs-on: [intel-cpu, 8-cpu, ci]
     steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2


@@ -12,6 +12,12 @@ on:
       slice_id:
         required: true
         type: number
+      runner:
+        required: true
+        type: string
+      docker:
+        required: true
+        type: string
 
 env:
   HF_HOME: /mnt/cache

@@ -31,12 +37,13 @@ jobs:
   run_models_gpu:
     name: " "
     strategy:
+      max-parallel: 8
       fail-fast: false
       matrix:
         folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
-    runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
-      image: huggingface/transformers-all-latest-gpu
+      image: ${{ inputs.docker }}
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Echo input and matrix info

@@ -65,6 +72,18 @@ jobs:
         working-directory: /transformers
         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
 
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') }}
+        working-directory: /transformers
+        run: |
+          python3 -m pip install -U datasets
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
       - name: NVIDIA-SMI
         run: |
           nvidia-smi

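In the run_models_gpu matrix above, folder_slices is a JSON-encoded list of folder lists produced by the calling workflow's setup job, and slice_id picks one of those lists; each matrix job then covers one folder of that slice. A minimal Python sketch of the expression fromJson(inputs.folder_slices)[inputs.slice_id] (the folder names here are only illustrative):

    import json

    # Illustrative stand-in for the JSON string passed as `folder_slices`.
    folder_slices = json.dumps([
        ["models/bert", "models/gpt2"],   # slice 0
        ["models/t5", "models/vit"],      # slice 1
    ])

    slice_id = 1
    # Equivalent of `fromJson(inputs.folder_slices)[inputs.slice_id]`:
    folders = json.loads(folder_slices)[slice_id]
    print(folders)  # ['models/t5', 'models/vit'] -> one matrix job per folder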

@@ -0,0 +1,43 @@
name: Self-hosted runner (nightly-ci)
on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
push:
branches:
- run_nightly_ci*
jobs:
build_nightly_ci_images:
name: Build Nightly CI Docker Images
if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
secrets: inherit
model-ci:
name: Model CI
needs: [build_nightly_ci_images]
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: ci
docker: huggingface/transformers-all-latest-torch-nightly-gpu
ci_event: Nightly CI
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
needs: [build_nightly_ci_images]
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: ci
# test deepspeed nightly build with the latest release torch
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Nightly CI
working-directory-prefix: /workspace
secrets: inherit


@@ -2,32 +2,30 @@ name: Self-hosted runner (nightly-past-ci-caller)
 
 on:
   schedule:
-    # 2:17 am on each Sunday and Thursday
-    - cron: "17 2 * * 0,4"
+    - cron: "17 2,14 * * *"
   push:
     branches:
-      - run_nightly_ci*
       - run_past_ci*
 
 jobs:
-  build_nightly_ci_images:
-    name: Build Nightly CI Docker Images
-    if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
-    uses: ./.github/workflows/build-nightly-ci-docker-images.yml
-    secrets: inherit
-
-  run_nightly_ci:
-    name: Nightly CI
-    needs: [build_nightly_ci_images]
-    uses: ./.github/workflows/self-nightly-scheduled.yml
-    secrets: inherit
+  get_number:
+    name: Get number
+    runs-on: ubuntu-22.04
+    outputs:
+      run_number: ${{ steps.get_number.outputs.run_number }}
+    steps:
+      - name: Get number
+        id: get_number
+        run: |
+          echo "${{ github.run_number }}"
+          echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')"
+          echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT
 
   run_past_ci_pytorch_1-13:
     name: PyTorch 1.13
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_nightly_ci]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 0 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: pytorch
       version: "1.13"

@@ -36,9 +34,9 @@ jobs:
 
   run_past_ci_pytorch_1-12:
     name: PyTorch 1.12
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_past_ci_pytorch_1-13]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: pytorch
       version: "1.12"

@@ -47,9 +45,9 @@ jobs:
 
   run_past_ci_pytorch_1-11:
     name: PyTorch 1.11
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_past_ci_pytorch_1-12]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: pytorch
       version: "1.11"

@@ -58,9 +56,9 @@ jobs:
 
   run_past_ci_tensorflow_2-11:
     name: TensorFlow 2.11
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_pytorch_1-11]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 3 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: tensorflow
       version: "2.11"

@@ -69,9 +67,9 @@ jobs:
 
   run_past_ci_tensorflow_2-10:
     name: TensorFlow 2.10
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-11]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 4 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: tensorflow
       version: "2.10"

@@ -80,9 +78,9 @@ jobs:
 
   run_past_ci_tensorflow_2-9:
     name: TensorFlow 2.9
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-10]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 5 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: tensorflow
       version: "2.9"

@@ -91,9 +89,9 @@ jobs:
 
   run_past_ci_tensorflow_2-8:
     name: TensorFlow 2.8
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-9]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 6 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: tensorflow
       version: "2.8"

@@ -102,9 +100,9 @@ jobs:
 
   run_past_ci_tensorflow_2-7:
     name: TensorFlow 2.7
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-8]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 7 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: tensorflow
       version: "2.7"

@@ -113,9 +111,9 @@ jobs:
 
   run_past_ci_tensorflow_2-6:
     name: TensorFlow 2.6
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-7]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 8 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
    with:
       framework: tensorflow
       version: "2.6"

@@ -124,9 +122,9 @@ jobs:
 
   run_past_ci_tensorflow_2-5:
     name: TensorFlow 2.5
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-6]
-    uses: ./.github/workflows/self-past.yml
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 9 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    uses: ./.github/workflows/self-past-caller.yml
     with:
       framework: tensorflow
       version: "2.5"

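For readability, a small Python sketch of the dispatch above: get_number reduces github.run_number modulo 10, and each run_past_ci_* job only fires when the result matches its index, so the ten past configurations rotate across the twice-daily scheduled runs (cron "17 2,14 * * *").

    # The ten Past CI configurations, in the order of their run_number checks above.
    past_ci_jobs = [
        ("pytorch", "1.13"), ("pytorch", "1.12"), ("pytorch", "1.11"),
        ("tensorflow", "2.11"), ("tensorflow", "2.10"), ("tensorflow", "2.9"),
        ("tensorflow", "2.8"), ("tensorflow", "2.7"), ("tensorflow", "2.6"),
        ("tensorflow", "2.5"),
    ]

    def selected_job(run_number: int):
        # Same computation as `print(int(${{ github.run_number }}) % 10)` in the step above.
        return past_ci_jobs[run_number % 10]

    print(selected_job(42))  # ('pytorch', '1.11') -> only run_past_ci_pytorch_1-11 fires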

@@ -1,290 +0,0 @@
name: Self-hosted runner (nightly-ci)
# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
on:
repository_dispatch:
workflow_call:
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
RUN_PT_TF_CROSS_TESTS: 1
CUDA_VISIBLE_DEVICES: 0,1
jobs:
setup:
name: Setup
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Cleanup
working-directory: /transformers
run: |
rm -rf tests/__pycache__
rm -rf tests/models/__pycache__
rm -rf reports
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- id: set-matrix
name: Identify models to test
working-directory: /transformers/tests
run: |
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
- name: NVIDIA-SMI
run: |
nvidia-smi
run_tests_single_gpu:
name: Model tests
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
run_tests_multi_gpu:
name: Model tests
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
run_torch_cuda_extensions_gpu:
name: Torch CUDA extension tests
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /workspace/transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /workspace/transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Remove cached torch extensions
run: rm -rf /github/home/.cache/torch_extensions/
# To avoid unknown test failures
- name: Pre build DeepSpeed *again*
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /workspace/transformers
run: |
python utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /workspace/transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /workspace/transformers
run: |
python -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_nightly"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_nightly
path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Send results to webhook
runs-on: ubuntu-22.04
if: always()
needs: [
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
run_torch_cuda_extensions_gpu
]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Setup status: ${{ needs.setup.result }}"
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Nightly CI
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
# delete-artifact
- uses: geekyeggo/delete-artifact@v2
with:
name: |
single-*
multi-*

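For readability, the Python one-liner used by the set-matrix step of the removed workflow above, expanded into plain statements (it assumes the current directory is the tests/ folder, as in the workflow):

    import os

    # Collect `tests/models/*` plus the other top-level test directories,
    # with the model folders listed first and the `models` parent excluded.
    tests = os.getcwd()
    model_tests = os.listdir(os.path.join(tests, "models"))
    d1 = sorted(list(filter(os.path.isdir, os.listdir(tests))))
    d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests])))
    d1.remove("models")
    d = d2 + d1
    print(d)  # e.g. ['models/albert', ..., 'benchmark', 'generation', ...]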
.github/workflows/self-past-caller.yml

@@ -0,0 +1,40 @@
name: Self-hosted runner (past-ci)
on:
workflow_call:
inputs:
framework:
required: true
type: string
version:
required: true
type: string
# Use this to control the commit to test against
sha:
default: 'main'
required: false
type: string
jobs:
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: past-ci
docker: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
ci_event: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: past-ci
docker: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
ci_event: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
secrets: inherit


@@ -1,357 +0,0 @@
name: Self-hosted runner (past-ci)
# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
on:
workflow_call:
inputs:
framework:
required: true
type: string
version:
required: true
type: string
# Use this to control the commit to test against
sha:
default: 'main'
required: false
type: string
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
RUN_PT_TF_CROSS_TESTS: 1
CUDA_VISIBLE_DEVICES: 0,1
jobs:
setup:
name: Setup
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ inputs.sha }}
- name: Cleanup
working-directory: /transformers
run: |
rm -rf tests/__pycache__
rm -rf tests/models/__pycache__
rm -rf reports
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- id: set-matrix
working-directory: /transformers
name: Identify models to test
run: |
cd tests
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
run_tests_single_gpu:
name: Model tests
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ inputs.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Update some packages
working-directory: /transformers
run: python3 -m pip install -U datasets
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install
if: inputs.framework == 'pytorch'
working-directory: /transformers
run: |
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: Save job name
if: ${{ always() }}
shell: bash
run: |
matrix_folders=${matrix_folders/'models_'/'models/'}
job_name="Model tests ($matrix_folders, ${{ matrix.machine_type }})"
echo "$job_name"
echo "$job_name" > /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/job_name.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
run_tests_multi_gpu:
name: Model tests
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ inputs.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Update some packages
working-directory: /transformers
run: python3 -m pip install -U datasets
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install
if: inputs.framework == 'pytorch'
working-directory: /transformers
run: |
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: Save job name
if: ${{ always() }}
shell: bash
run: |
matrix_folders=${matrix_folders/'models_'/'models/'}
job_name="Model tests ($matrix_folders, ${{ matrix.machine_type }})"
echo "$job_name"
echo "$job_name" > /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/job_name.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
run_torch_cuda_extensions_gpu:
name: Torch CUDA extension tests
if: inputs.framework == 'pytorch'
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
needs: setup
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Update some packages
working-directory: /transformers
run: python3 -m pip install -U datasets
- name: Install
working-directory: /transformers
run: |
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
- name: Remove cached torch extensions
run: rm -rf /github/home/.cache/torch_extensions/
# To avoid unknown test failures
- name: Pre build DeepSpeed *again*
working-directory: /
run: |
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Send results to webhook
runs-on: ubuntu-22.04
if: always()
needs: [
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
run_torch_cuda_extensions_gpu
]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Setup status: ${{ needs.setup.result }}"
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
# Create a directory to store test failure tables in the next step
- name: Create directory
run: mkdir test_failure_tables
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }}
path: test_failure_tables
# delete-artifact
- uses: geekyeggo/delete-artifact@v2
with:
name: |
single-*
multi-*


@@ -16,6 +16,9 @@ jobs:
     with:
       job: run_models_gpu
       slack_report_channel: "#transformers-ci-daily-models"
+      runner: daily-ci
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
     secrets: inherit
 
   torch-pipeline:

@@ -24,6 +27,9 @@ jobs:
     with:
       job: run_pipelines_torch_gpu
       slack_report_channel: "#transformers-ci-daily-pipeline-torch"
+      runner: daily-ci
+      docker: huggingface/transformers-pytorch-gpu
+      ci_event: Daily CI
     secrets: inherit
 
   tf-pipeline:

@@ -32,6 +38,9 @@ jobs:
     with:
       job: run_pipelines_tf_gpu
       slack_report_channel: "#transformers-ci-daily-pipeline-tf"
+      runner: daily-ci
+      docker: huggingface/transformers-tensorflow-gpu
+      ci_event: Daily CI
     secrets: inherit
 
   example-ci:

@@ -40,6 +49,9 @@ jobs:
     with:
       job: run_examples_gpu
       slack_report_channel: "#transformers-ci-daily-examples"
+      runner: daily-ci
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
     secrets: inherit
 
   deepspeed-ci:

@@ -48,6 +60,10 @@ jobs:
     with:
       job: run_torch_cuda_extensions_gpu
       slack_report_channel: "#transformers-ci-daily-deepspeed"
+      runner: daily-ci
+      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
+      ci_event: Daily CI
+      working-directory-prefix: /workspace
     secrets: inherit
 
   quantization-ci:

@@ -56,4 +72,7 @@ jobs:
     with:
       job: run_quantization_torch_gpu
       slack_report_channel: "#transformers-ci-daily-quantization"
+      runner: daily-ci
+      docker: huggingface/transformers-quantization-latest-gpu
+      ci_event: Daily CI
     secrets: inherit


@@ -15,6 +15,19 @@ on:
       slack_report_channel:
         required: true
         type: string
+      runner:
+        required: true
+        type: string
+      docker:
+        required: true
+        type: string
+      ci_event:
+        required: true
+        type: string
+      working-directory-prefix:
+        default: ''
+        required: false
+        type: string
 
 env:
   HF_HOME: /mnt/cache
@@ -38,7 +51,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

@@ -96,6 +109,8 @@ jobs:
       folder_slices: ${{ needs.setup.outputs.folder_slices }}
       machine_type: ${{ matrix.machine_type }}
       slice_id: ${{ matrix.slice_id }}
+      runner: ${{ inputs.runner }}
+      docker: ${{ inputs.docker }}
     secrets: inherit
 
   run_pipelines_torch_gpu:

@@ -105,7 +120,7 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
       image: huggingface/transformers-pytorch-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

@@ -155,7 +170,7 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
       image: huggingface/transformers-tensorflow-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

@@ -206,7 +221,7 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -257,69 +272,88 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
-      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
+      image: ${{ inputs.docker }}
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
-        working-directory: /workspace/transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
         run: git fetch && git checkout ${{ github.sha }}
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /workspace/transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
 
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: |
+          python3 -m pip install -U datasets
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
       - name: Remove cached torch extensions
         run: rm -rf /github/home/.cache/torch_extensions/
 
       # To avoid unknown test failures
-      - name: Pre build DeepSpeed *again*
-        working-directory: /workspace
+      - name: Pre build DeepSpeed *again* (for daily CI)
+        if: ${{ contains(inputs.ci_event, 'Daily CI') }}
+        working-directory: ${{ inputs.working-directory-prefix }}/
         run: |
           python3 -m pip uninstall -y deepspeed
           DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again* (for nightly & Past CI)
+        if: ${{ contains(inputs.ci_event, 'Nightly CI') || contains(inputs.ci_event, 'Past CI') }}
+        working-directory: ${{ inputs.working-directory-prefix }}/
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          rm -rf DeepSpeed
+          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
 
       - name: Environment
-        working-directory: /workspace/transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
         run: |
-          python utils/print_env.py
+          python3 utils/print_env.py
 
       - name: Show installed libraries and their versions
-        working-directory: /workspace/transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
         run: pip freeze
 
       - name: Run all tests on GPU
-        working-directory: /workspace/transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
         run: |
-          python -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
 
       - name: Failure short reports
         if: ${{ failure() }}
         continue-on-error: true
-        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+        run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
 
       - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
         with:
           name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
-          path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: ${{ inputs.working-directory-prefix }}/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
 
   run_quantization_torch_gpu:
     if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
     name: " "
     needs: setup
     strategy:
+      max-parallel: 4
       fail-fast: false
       matrix:
         folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
+    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
     container:
       image: huggingface/transformers-quantization-latest-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -434,5 +468,6 @@ jobs:
       # This would be an empty string if `setup` is skipped.
       folder_slices: ${{ needs.setup.outputs.folder_slices }}
       quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
+      ci_event: ${{ inputs.ci_event }}
     secrets: inherit


@@ -18,6 +18,9 @@ on:
       quantization_matrix:
         required: true
         type: string
+      ci_event:
+        required: true
+        type: string
 
 env:
   TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}

@@ -45,7 +48,7 @@ jobs:
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_EVENT: scheduled
+          CI_EVENT: ${{ inputs.ci_event }}
           CI_SHA: ${{ github.sha }}
           CI_WORKFLOW_REF: ${{ github.workflow_ref }}
           CI_TEST_JOB: ${{ inputs.job }}

@@ -76,7 +79,7 @@ jobs:
           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
           SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
-          CI_EVENT: scheduled
+          CI_EVENT: ${{ inputs.ci_event }}
           CI_SHA: ${{ github.sha }}
           CI_TEST_JOB: ${{ inputs.job }}
           SETUP_STATUS: ${{ inputs.setup_status }}


@@ -641,7 +641,7 @@ class Message:
     def get_new_model_failure_blocks(self, with_header=True, to_truncate=True):
         if self.prev_ci_artifacts is None:
-            return {}
+            return []
 
         sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0])

@@ -767,10 +767,11 @@ class Message:
             # To save the list of new model failures
             blocks = self.get_new_model_failure_blocks(to_truncate=False)
-            failure_text = blocks[-1]["text"]["text"]
-            file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.txt")
-            with open(file_path, "w", encoding="UTF-8") as fp:
-                fp.write(failure_text)
+            if blocks:
+                failure_text = blocks[-1]["text"]["text"]
+                file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.txt")
+                with open(file_path, "w", encoding="UTF-8") as fp:
+                    fp.write(failure_text)
 
 
 def retrieve_artifact(artifact_path: str, gpu: Optional[str]):

@@ -891,7 +892,7 @@ if __name__ == "__main__":
     # To find the PR number in a commit title, for example, `Add AwesomeFormer model (#99999)`
     pr_number_re = re.compile(r"\(#(\d+)\)$")
 
-    title = f"🤗 Results of the {ci_event} tests."
+    title = f"🤗 Results of {ci_event} - {os.getenv('CI_TEST_JOB')}."
 
     # Add Commit/PR title with a link for push CI
     # (check the title in 2 env. variables - depending on the CI is triggered via `push` or `workflow_run` event)
     ci_title_push = os.environ.get("CI_TITLE_PUSH")

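The title change above makes the Slack report name both the CI event and the job. With values matching what the new callers pass (for example job run_models_gpu and a Past CI event), the title would render along these lines; the values below are illustrative:

    import os

    os.environ["CI_TEST_JOB"] = "run_models_gpu"  # set by the workflow from `inputs.job`
    ci_event = "Past CI - pytorch-1.13"           # as built by self-past-caller.yml

    title = f"🤗 Results of {ci_event} - {os.getenv('CI_TEST_JOB')}."
    print(title)  # 🤗 Results of Past CI - pytorch-1.13 - run_models_gpu.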

@@ -175,7 +175,7 @@ if __name__ == "__main__":
     # This env. variable is set in workflow file (under the job `send_results`).
     ci_event = os.environ["CI_EVENT"]
 
-    title = f"🤗 Results of the {ci_event} tests."
+    title = f"🤗 Results of the {ci_event} - {os.getenv('CI_TEST_JOB')}."
 
     if setup_failed:
         Message.error_out(