diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 210076f1416..ed43d5c02cc 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -5,6 +5,7 @@ on:
     branches:
       - master
       - ci_*
+      - ci-*
     paths:
       - "src/**"
       - "tests/**"
@@ -186,11 +187,85 @@ jobs:
           name: run_all_tests_tf_multi_gpu_test_reports
           path: reports
 
+  run_tests_torch_cuda_extensions_gpu:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+  run_tests_torch_cuda_extensions_multi_gpu:
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+
   send_results:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
+    needs: [
+        run_tests_torch_gpu,
+        run_tests_tf_gpu,
+        run_tests_torch_multi_gpu,
+        run_tests_tf_multi_gpu,
+        run_tests_torch_cuda_extensions_gpu,
+        run_tests_torch_cuda_extensions_multi_gpu
+    ]
     steps:
       - uses: actions/checkout@v2
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 978d9e02a69..df9148c38e0 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -246,11 +246,84 @@ jobs:
           name: run_all_tests_tf_multi_gpu_test_reports
           path: reports
 
+  run_all_tests_torch_cuda_extensions_gpu:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+  run_all_tests_torch_cuda_extensions_multi_gpu:
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
   send_results:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    needs: [
+        run_all_tests_torch_gpu,
+        run_all_tests_tf_gpu,
+        run_all_tests_torch_multi_gpu,
+        run_all_tests_tf_multi_gpu,
+        run_all_tests_torch_cuda_extensions_gpu,
+        run_all_tests_torch_cuda_extensions_multi_gpu
+    ]
     steps:
       - uses: actions/checkout@v2
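To reproduce the new CUDA-extensions job outside of CI, a minimal local sketch follows. Everything here is an assumption, not part of the diff: it presumes Docker with the NVIDIA container toolkit, the repository checked out in the current directory, and an arbitrary mount point /workspace/transformers. The workflow passes "--gpus 0" in the container options; for a plain docker run, "--gpus all" is the usual way to expose the host GPUs. The --make-reports flag is not a stock pytest option; it is registered by the repository's own pytest configuration and produces the reports/tests_*_failures_short.txt files that the "Failure short reports" steps cat on failure.

    # Sketch only: paths, mount point, and --gpus all are local assumptions.
    docker run --rm --gpus all --shm-size "16gb" --ipc host \
      -v /mnt/cache/.cache/huggingface:/mnt/cache/ \
      -v "$(pwd)":/workspace/transformers -w /workspace/transformers \
      nvcr.io/nvidia/pytorch:21.03-py3 bash -c '
        pip install --upgrade pip
        pip install .[testing,deepspeed]
        # Same invocation as the "Run all tests on GPU" step of the single-gpu job
        python -m pytest -n 1 --dist=loadfile \
          --make-reports=tests_torch_cuda_extensions_gpu \
          tests/deepspeed tests/extended
      '

The multi-gpu variant differs only in the extras installed (testing,deepspeed,fairscale) and the report name (tests_torch_cuda_extensions_multi_gpu).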