[AMD] Add initial version for run_tests_multi_gpu (#26346)

* Add initial version for run_tests_multi_gpu

* Trigger change in BERT

* fix typo setup -> setup_gpu

* Add tag mi210

* Enable multi-gpu jobs

* One more

* Use dynamic device allocation (see the sketch just before the diff)

* Attempt to fix syntax for docker create

* fix script path

* fix

* temp machine type

* fix label

* Enable multi-gpu tests

* Rename multi-amd-gpu to multi-gpu

* Let's not be lazy dude

* Update rocm-smi output

* Add gpu_flavour in the matrix

* Fix typos

* merge single/multi dispatch into the matrix

* Format.

* Revert BERT's change

---------

Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com>
commit 3632fb3c25
parent 768aa3d9cd
Author: Funtowicz Morgan
Date: 2023-10-03 11:13:45 +02:00 (committed by GitHub)

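The key functional change in the diff below, per the "Use dynamic device allocation" item above, is the container options: instead of binding only the fixed render node /dev/dri/renderD128 (the first AMD GPU), the container now receives the whole /dev/dri directory together with HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES forwarded from the runner, so the runner can decide per job which GPU(s) are visible. The shell sketch below is not part of the workflow; it only illustrates Docker's --env VAR form (no value), which copies the variable from the invoking environment. The two-device allocation is hypothetical.

# Hypothetical allocation a runner might export for a multi-gpu job
export HIP_VISIBLE_DEVICES=0,1
export ROCR_VISIBLE_DEVICES=0,1
# Expose all AMD devices (/dev/kfd + /dev/dri), then narrow visibility via the forwarded variables
docker run --rm \
  --device /dev/kfd --device /dev/dri \
  --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES \
  --shm-size "16gb" --ipc host \
  -v /mnt/cache/.cache/huggingface:/mnt/cache/ \
  huggingface/transformers-pytorch-amd-gpu-push-ci \
  rocm-smi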

@@ -44,28 +44,32 @@ jobs:
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: ROCM-SMI
run: |
rocm-smi
rocminfo | grep "Agent" -A 14
- name: Show HIP environment
run: |
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"
setup_gpu:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -150,7 +154,7 @@ jobs:
echo "matrix=$keys" >> $GITHUB_OUTPUT
echo "test_map=$test_map" >> $GITHUB_OUTPUT
run_tests_single_gpu:
run_tests_amdgpu:
name: Model tests
needs: setup_gpu
# `dummy` means there is no test to run
@@ -159,12 +163,12 @@
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
@@ -216,7 +220,11 @@ jobs:
- name: ROCM-SMI
run: |
rocm-smi
rocminfo | grep "Agent" -A 14
- name: Show HIP environment
run: |
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Environment
working-directory: /transformers
@@ -252,8 +260,7 @@ jobs:
check_runner_status,
check_runners,
setup_gpu,
run_tests_single_gpu,
# run_tests_multi_gpu,
run_tests_amdgpu,
# run_tests_torch_cuda_extensions_single_gpu,
# run_tests_torch_cuda_extensions_multi_gpu
]
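The new "Show HIP environment" step only echoes the forwarded variables. A possible follow-up diagnostic (assumed here, not part of this diff) would be to confirm, inside the same container, that the ROCm PyTorch build shipped in the CI image sees exactly the allocated devices; ROCm builds reuse the torch.cuda API, so device_count() reports the visible AMD GPUs.

# Assumed extra step body, not in the workflow above
rocm-smi
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"
python3 -c "import torch; print('visible GPUs:', torch.cuda.device_count())"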