Enable PyTorch nightly build CI (#17335)
* nightly build pytorch CI
* fix working dir
* change time and event name

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 3c7e56fbb1
commit ca169dbdf1

.github/workflows/build-docker-images.yml (53 changed lines)
@@ -39,6 +39,33 @@ jobs:
           push: true
           tags: huggingface/transformers-all-latest-gpu
 
+  latest-with-torch-nightly-docker:
+    name: "Nightly PyTorch + Stable TensorFlow"
+    runs-on: ubuntu-latest
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Check out code
+        uses: actions/checkout@v2
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+            PYTORCH=pre
+          push: true
+          tags: huggingface/transformers-all-latest-torch-nightly-gpu
+
   latest-torch-deepspeed-docker:
     name: "Latest PyTorch + DeepSpeed"
     runs-on: ubuntu-latest
@@ -65,6 +92,32 @@ jobs:
           push: true
           tags: huggingface/transformers-pytorch-deepspeed-latest-gpu
 
+  nightly-torch-deepspeed-docker:
+    name: "Nightly PyTorch + DeepSpeed"
+    runs-on: ubuntu-latest
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Check out code
+        uses: actions/checkout@v2
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
+
   doc-builder:
     name: "Doc builder"
     runs-on: ubuntu-latest
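The new jobs mirror the existing image builds with two twists: the nightly PyTorch image passes `PYTORCH=pre` through `build-args`, and the DeepSpeed job points at a new Docker context. As a rough sketch of the plumbing (illustrative only; docker/build-push-action forwards each `build-args` line to `docker build` as a `--build-arg` flag):

    # Sketch: how the build-args block above reaches `docker build`.
    build_args = {"REF": "main", "PYTORCH": "pre"}
    flags = [f"--build-arg={key}={value}" for key, value in build_args.items()]
    print("docker build", " ".join(flags), "./docker/transformers-all-latest-gpu")

`PYTORCH=pre` is consumed by the `ARG PYTORCH` logic in `docker/transformers-all-latest-gpu/Dockerfile` (see the Dockerfile hunk further down), which switches the install to PyTorch's nightly wheel index.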
.github/workflows/self-nightly-scheduled.yml (424 changed lines)
@@ -1,250 +1,236 @@
-name: Self-hosted runner; Nightly (scheduled)
+name: Self-hosted runner (nightly)
 
+# Note that each job's dependencies go into a corresponding docker file.
+#
+# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
+# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
+# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
+
 on:
-  push:
-    branches:
-      - nightly_ci*
-  repository_dispatch:
-  schedule:
-    - cron: "0 0 */3 * *"
+  repository_dispatch:
+  schedule:
+    - cron: "0 16 * * *"
 
 env:
   HF_HOME: /mnt/cache
   TRANSFORMERS_IS_CI: yes
-  RUN_SLOW: yes
-  OMP_NUM_THREADS: 16
-  MKL_NUM_THREADS: 16
-  PYTEST_TIMEOUT: 600
-  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
-  run_all_tests_torch_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libsndfile1-dev git espeak-ng
-          pip install --upgrade pip
-          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
-          pip install https://github.com/kpu/kenlm/archive/master.zip
-          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          utils/print_env.py
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_gpu/failures_short.txt
-
-      - name: Run examples tests on GPU
-        if: ${{ always() }}
-        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
-          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          TRANSFORMERS_IS_CI: yes
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/examples_torch_gpu/failures_short.txt
-
-      - name: Run all pipeline tests on GPU
-        if: ${{ always() }}
-        env:
-          RUN_PIPELINE_TESTS: yes
-        run: |
-          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_pipeline_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_all_tests_torch_gpu_test_reports
-          path: reports
-
-  run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        continue-on-error: true
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libsndfile1-dev git espeak-ng
-          pip install --upgrade pip
-          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
-          pip install https://github.com/kpu/kenlm/archive/master.zip
-          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          utils/print_env.py
-
-      - name: Run all tests on GPU
-        env:
-          MKL_SERVICE_FORCE_INTEL: 1
-        run: |
-          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_multi_gpu/failures_short.txt
-
-      - name: Run all pipeline tests on GPU
-        if: ${{ always() }}
-        env:
-          RUN_PIPELINE_TESTS: yes
-        run: |
-          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_pipeline_multi_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_all_tests_torch_multi_gpu_test_reports
-          path: reports
+  setup:
+    name: Setup
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    container:
+      image: huggingface/transformers-all-latest-torch-nightly-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: |
+          git fetch && git checkout ${{ github.sha }}
+
+      - name: Cleanup
+        working-directory: /transformers
+        run: |
+          rm -rf tests/__pycache__
+          rm -rf tests/models/__pycache__
+          rm -rf reports
+
+      - id: set-matrix
+        name: Identify models to test
+        working-directory: /transformers/tests
+        run: |
+          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+  run_tests_single_gpu:
+    name: Model tests
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [single-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    container:
+      image: huggingface/transformers-all-latest-torch-nightly-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_multi_gpu:
+    name: Model tests
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    container:
+      image: huggingface/transformers-all-latest-torch-nightly-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
 
   run_all_tests_torch_cuda_extensions_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
-          pip install --upgrade pip
-          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
-          pip install .[deepspeed-testing]
-          pip install https://github.com/kpu/kenlm/archive/master.zip
-          pip install git+https://github.com/microsoft/DeepSpeed
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          utils/print_env.py
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_cuda_extensions_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_tests_torch_cuda_extensions_gpu_test_reports
-          path: reports
-
-  run_all_tests_torch_cuda_extensions_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        continue-on-error: true
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
-          pip install --upgrade pip
-          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
-          rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
-          pip install .[testing,fairscale]
-          pip install https://github.com/kpu/kenlm/archive/master.zip
-          pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          utils/print_env.py
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_cuda_extensions_multi_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
-          path: reports
+    name: Torch CUDA extension tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    needs: setup
+    container:
+      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /workspace/transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /workspace
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          rm -rf DeepSpeed
+          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /workspace/transformers
+        run: |
+          python utils/print_env.py
+
+      - name: Run all tests on GPU
+        working-directory: /workspace/transformers
+        run: |
+          python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
+          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
 
   send_results:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [
-      run_all_tests_torch_gpu,
-      run_all_tests_torch_multi_gpu,
-      run_all_tests_torch_cuda_extensions_gpu,
-      run_all_tests_torch_cuda_extensions_multi_gpu
-    ]
+    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
     steps:
       - uses: actions/checkout@v2
-
       - uses: actions/download-artifact@v2
-
       - name: Send message to Slack
         env:
           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
           CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
-          CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+          CI_EVENT: nightly-build
+        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
+        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
           pip install slack_sdk
-          python utils/notification_service.py scheduled nightly-torch
+          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
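For readability, the `set-matrix` one-liner in the new `setup` job unpacks to roughly the following (a sketch, not part of the commit; it relies on the step running from `/transformers/tests`, so bare directory names resolve for `os.path.isdir`):

    import os

    tests = os.getcwd()  # /transformers/tests inside the container
    model_tests = os.listdir(os.path.join(tests, "models"))
    # Top-level test dirs, plus one entry per model dir under tests/models
    d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
    d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
    d1.remove("models")  # replaced by the per-model entries in d2
    print(d2 + d1)       # e.g. ["models/bert", ..., "pipelines", ...]

Each entry becomes one `folders` value in the test matrix; the `Echo folder` step then rewrites `models/bert` to `models_bert` via the bash substitution `${matrix_folders/'models/'/'models_'}`, since `/` is not allowed in artifact names.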
.github/workflows/self-push.yml (6 changed lines)

@@ -207,7 +207,7 @@ jobs:
 
       # To avoid unknown test failures
       - name: Pre build DeepSpeed *again*
-        working-directory: /workspace/transformers
+        working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
           DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
@@ -217,10 +217,12 @@ jobs:
           nvidia-smi
 
       - name: Environment
+        working-directory: /workspace/transformers
         run: |
           python utils/print_env.py
 
       - name: Run all non-slow selected tests on GPU
+        working-directory: /workspace/transformers
         # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
         run: |
           python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
@@ -256,7 +258,7 @@ jobs:
 
       # To avoid unknown test failures
       - name: Pre build DeepSpeed *again*
-        working-directory: /workspace/transformers
+        working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
           DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
.github/workflows/self-scheduled.yml (2 changed lines)

@@ -308,7 +308,7 @@ jobs:
 
       # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-        working-directory: /workspace/transformers
+        working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
           DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
docker/transformers-all-latest-gpu/Dockerfile

@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
 
+# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
+SHELL ["sh", "-lc"]
+
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).
 
@@ -21,11 +24,20 @@ ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
 
-RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+# TODO: Handle these in a python utility script
+RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
+RUN echo torch=$VERSION
+# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
+# Currently, let's just use their latest releases (when `torch` is installed with a release version)
+# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI).
+RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
 
 RUN python3 -m pip install --no-cache-dir -U tensorflow
 RUN python3 -m pip uninstall -y flax jax
 
-RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$PYTORCH+$CUDA.html
+# Use installed torch version for `torch-scatter` to avid to deal with PYTORCH='pre'.
+# If torch is nightly version, the link is likely to be invalid, but the installation falls back to the latest torch-scatter
+RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
 RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
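The two `RUN` lines that choose the torch wheels encode a small decision in shell; here is the same rule as a Python sketch (illustrative only, the function name is not part of the commit):

    def torch_pip_command(pytorch: str, cuda: str = "cu113") -> str:
        # PYTORCH may be a release like "1.11" (pin the series), "" (latest
        # stable), or "pre" (nightly wheels from the separate nightly index).
        if pytorch == "pre":
            return ("pip install -U --pre torch torchvision torchaudio "
                    f"--extra-index-url https://download.pytorch.org/whl/nightly/{cuda}")
        version = f"torch=={pytorch}.*" if pytorch else "torch"
        return (f"pip install -U {version} torchvision torchaudio "
                f"--extra-index-url https://download.pytorch.org/whl/{cuda}")

The `torch-scatter` line likewise stops interpolating `$PYTORCH` (which may be the non-version `pre`) and instead asks the installed torch for its own version; e.g. `"1.12.0.dev20220520+cu113".split("+")[0]` yields `"1.12.0.dev20220520"` for the wheel URL.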
docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile

@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
 
+# Example: `cu102`, `cu113`, etc.
+ARG CUDA='cu113'
+
 RUN apt -y update
 RUN apt install -y libaio-dev
 RUN python3 -m pip install --no-cache-dir --upgrade pip
@@ -13,13 +16,16 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
+RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
 
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
 
-# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
 RUN python3 -m pip uninstall -y deepspeed
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+# This has to be run (again) inside the GPU VMs running the tests.
+# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
+# TODO: Find out why test fail.
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
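The `DS_BUILD_CPU_ADAM`, `DS_BUILD_AIO` and `DS_BUILD_UTILS` flags make DeepSpeed pre-compile those ops at install time instead of JIT-compiling them during the first test. A minimal post-build sanity check might be (a sketch; for the full pre-compiled vs. JIT op table, DeepSpeed ships the `ds_report` CLI):

    # Sketch: confirm the pre-built DeepSpeed imports cleanly.
    import deepspeed

    print(deepspeed.__version__)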
docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile (new file, 35 lines)
@@ -0,0 +1,35 @@
+FROM nvcr.io/nvidia/pytorch:21.03-py3
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Example: `cu102`, `cu113`, etc.
+ARG CUDA='cu113'
+
+RUN apt -y update
+RUN apt install -y libaio-dev
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+ARG REF=main
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
+# Install **nightly** release PyTorch (flag `--pre`)
+# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
+# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
+RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+
+RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+
+# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+RUN python3 -m pip uninstall -y deepspeed
+# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
+# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
+# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
+#    DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# this line must be added in order for python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
+
+# Disable for now as deepspeed is not installed above. To be enabled once the issue is fixed.
+# RUN python3 -c "from deepspeed.launcher.runner import main"