# transformers/.github/workflows/self-push.yml

name: Self-hosted runner (push)

# Triggers:
#   * pushes to master or to any ci_* / ci-* branch, but only when the push
#     touches at least one file matching the path filters below;
#   * repository_dispatch events (no activity-type filter is configured).
on:
  push:
    branches:
      - master
      - ci_*
      - ci-*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
      - "utils/**"
  repository_dispatch:

# Environment variables exported to every step of every job in this workflow.
env:
  # The GPU jobs bind-mount the host cache directory at /mnt/cache/ inside
  # their containers, so this points at that mount.
  HF_HOME: /mnt/cache
  # Quoted on purpose: a bare `yes` is a boolean literal under YAML 1.1, so a
  # YAML 1.1 loader or linter would read it as `true` instead of the string
  # "yes". Quoting pins the value regardless of which parser reads the file.
  TRANSFORMERS_IS_CI: "yes"
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60

jobs:
  # PyTorch test suite on a single-GPU self-hosted runner. The selective
  # "fetch the tests to run" steps are commented out below, so the whole
  # `tests` directory is executed on every trigger.
  run_tests_torch_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - single-gpu
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      # The host directory /mnt/cache/.cache/huggingface is bind-mounted at
      # /mnt/cache/, which is where the workflow-level HF_HOME points.
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # fetch-depth 2 fetches the two most recent commits instead of only the tip.
      - name: Launcher docker
        uses: actions/checkout@v2
        with:
          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      # Installs the package from the checked-out sources with the listed extras.
      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]

      # Purely informational: prints what torch sees before the tests start.
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

#      - name: Fetch the tests to run
#        run: |
#          python utils/tests_fetcher.py | tee test_preparation.txt
#
#      - name: Report fetched tests
#        uses: actions/upload-artifact@v2
#        with:
#          name: test_fetched
#          path: test_preparation.txt
#
      # Two pytest-xdist workers; --dist=loadfile keeps all tests of one file on
      # the same worker. The --make-reports id is the file-name prefix that the
      # "Failure short reports" step reads back from the reports directory.
      - name: Run all non-slow tests on GPU
        run: |
          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu tests
#          if [ -f test_list.txt ]; then
#            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt)
#          fi

      # always(): run the reporting steps even if an earlier step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

#  run_tests_tf_gpu:
#    runs-on: [self-hosted, docker-gpu, single-gpu]
#    timeout-minutes: 120
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#        with:
#          fetch-depth: 2
#
#      - name: NVIDIA-SMI
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          apt -y update && apt install -y git
#          pip install --upgrade pip
#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
#
#      - name: Fetch the tests to run
#        run: |
#          python utils/tests_fetcher.py | tee test_preparation.txt
#
#      - name: Report fetched tests
#        uses: actions/upload-artifact@v2
#        with:
#          name: test_fetched
#          path: test_preparation.txt
#
#      - name: Run all non-slow tests on GPU
#        env:
#          TF_NUM_INTRAOP_THREADS: 8
#          TF_NUM_INTEROP_THREADS: 1
#        run: |
#          if [ -f test_list.txt ]; then
#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt)
#          fi
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_tf_gpu_failures_short.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_tf_gpu_test_reports
#          path: reports


  # PyTorch test suite on a multi-GPU self-hosted runner. The selective
  # "fetch the tests to run" steps are commented out below, so the whole
  # `tests` directory is executed on every trigger.
  run_tests_torch_multi_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - multi-gpu
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      # `--gpus all` hands every GPU of the runner to the container. The host
      # directory /mnt/cache/.cache/huggingface is bind-mounted at /mnt/cache/,
      # which is where the workflow-level HF_HOME points.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # fetch-depth 2 fetches the two most recent commits instead of only the tip.
      - name: Launcher docker
        uses: actions/checkout@v2
        with:
          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      # Installs the package from the checked-out sources with the listed extras.
      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]

      # Purely informational: prints what torch sees before the tests start.
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

#      - name: Fetch the tests to run
#        run: |
#          python utils/tests_fetcher.py | tee test_preparation.txt
#
#      - name: Report fetched tests
#        uses: actions/upload-artifact@v2
#        with:
#          name: test_fetched
#          path: test_preparation.txt

      # Two pytest-xdist workers; --dist=loadfile keeps all tests of one file on
      # the same worker. The --make-reports id is the file-name prefix that the
      # "Failure short reports" step reads back from the reports directory.
      - name: Run all non-slow tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu tests
#          if [ -f test_list.txt ]; then
#            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt)
#          fi

      # always(): run the reporting steps even if an earlier step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

#  run_tests_tf_multi_gpu:
#    runs-on: [self-hosted, docker-gpu, multi-gpu]
#    timeout-minutes: 120
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#        with:
#          fetch-depth: 2
#
#      - name: NVIDIA-SMI
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          apt -y update && apt install -y git
#          pip install --upgrade pip
#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
#
#      - name: Fetch the tests to run
#        run: |
#          python utils/tests_fetcher.py | tee test_preparation.txt
#
#      - name: Report fetched tests
#        uses: actions/upload-artifact@v2
#        with:
#          name: test_fetched
#          path: test_preparation.txt
#
#      - name: Run all non-slow tests on GPU
#        env:
#          TF_NUM_INTRAOP_THREADS: 8
#          TF_NUM_INTEROP_THREADS: 1
#        run: |
#          if [ -f test_list.txt ]; then
#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt)
#          fi
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_tf_multi_gpu_failures_short.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_tf_multi_gpu_test_reports
#          path: reports

  # DeepSpeed / extended tests (tests/deepspeed and tests/extended) on a
  # single-GPU self-hosted runner, using the NVIDIA NGC PyTorch image. The
  # selective "fetch the tests to run" steps are commented out below, so both
  # directories are executed in full on every trigger.
  run_tests_torch_cuda_extensions_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - single-gpu
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # The host directory /mnt/cache/.cache/huggingface is bind-mounted at
      # /mnt/cache/, which is where the workflow-level HF_HOME points.
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # fetch-depth 2 fetches the two most recent commits instead of only the tip.
      - name: Launcher docker
        uses: actions/checkout@v2
        with:
          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      # Installs libaio-dev plus the package itself with the testing and
      # deepspeed extras.
      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed]

      # Purely informational: prints what torch sees before the tests start.
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

#      - name: Fetch the tests to run
#        run: |
#          python utils/tests_fetcher.py --filters tests/deepspeed tests/extended | tee test_preparation.txt
#
#      - name: Report fetched tests
#        uses: actions/upload-artifact@v2
#        with:
#          name: test_fetched
#          path: test_preparation.txt

      # A single pytest-xdist worker (-n 1). The --make-reports id is the
      # file-name prefix that the "Failure short reports" step reads back from
      # the reports directory.
      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
#          if [ -f test_list.txt ]; then
#            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt)
#          fi

      # always(): run the reporting steps even if an earlier step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_gpu_test_reports
          path: reports

  # DeepSpeed / extended tests (tests/deepspeed and tests/extended) on a
  # multi-GPU self-hosted runner, using the NVIDIA NGC PyTorch image. The
  # selective "fetch the tests to run" steps are commented out below, so both
  # directories are executed in full on every trigger.
  run_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # `--gpus all` exposes every GPU of the multi-GPU runner to the container,
      # matching run_tests_torch_multi_gpu. This job previously passed
      # `--gpus 0`, the setting used by the single-GPU jobs, which does not
      # request the runner's full set of GPUs for a multi-GPU test run.
      # The host directory /mnt/cache/.cache/huggingface is bind-mounted at
      # /mnt/cache/, which is where the workflow-level HF_HOME points.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # fetch-depth 2 fetches the two most recent commits instead of only the tip.
      - name: Launcher docker
        uses: actions/checkout@v2
        with:
          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      # Installs libaio-dev plus the package itself with the testing,
      # deepspeed and fairscale extras.
      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed,fairscale]

      # Purely informational: prints what torch sees before the tests start.
      # The reported device count shows how many GPUs the container received.
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

#      - name: Fetch the tests to run
#        run: |
#          python utils/tests_fetcher.py --filters tests/deepspeed tests/extended | tee test_preparation.txt
#
#      - name: Report fetched tests
#        uses: actions/upload-artifact@v2
#        with:
#          name: test_fetched
#          path: test_preparation.txt

      # A single pytest-xdist worker (-n 1). The --make-reports id is the
      # file-name prefix that the "Failure short reports" step reads back from
      # the reports directory.
      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
#          if [ -f test_list.txt ]; then
#            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt)
#          fi

      # always(): run the reporting steps even if an earlier step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports


  # Runs on a GitHub-hosted runner once all test jobs listed under `needs`
  # have finished, downloads their uploaded artifacts and invokes the
  # notification script.
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    # always(): still report when one or more of the needed jobs failed.
    if: always()
    needs:
      - run_tests_torch_gpu
      # - run_tests_tf_gpu
      - run_tests_torch_multi_gpu
      # - run_tests_tf_multi_gpu
      - run_tests_torch_cuda_extensions_gpu
      - run_tests_torch_cuda_extensions_multi_gpu
    steps:
      - uses: actions/checkout@v2

      # No `name` input is given, so every artifact of this workflow run is
      # downloaded.
      - uses: actions/download-artifact@v2

      # The Slack credentials come from repository secrets and are exposed to
      # this step only.
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py push