Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
[AMD] Add initial version for run_tests_multi_gpu (#26346)
* Add initial version for run_tests_multi_gpu
* Trigger change in BERT
* fix typo setup -> setup_gpu
* Add tag mi210
* Enable multi-gpu jobs
* One more
* Use dynamic device allocation
* Attempt to fix syntax for docker create
* fix script path
* fix
* temp machine type
* fix label
* Enable multi-gpu tests
* Rename multi-amd-gpu to multi-gpu
* Let's not be lazy dude
* Update rocm-smi output
* Add gpu_flavour in the matrix
* Fix typos
* merge single/multi dispatch into the matrix
* Format.
* Revert BERT's change

---------

Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com>
This commit is contained in:
parent
768aa3d9cd
commit
3632fb3c25
41
.github/workflows/self-push-amd.yml
vendored
41
.github/workflows/self-push-amd.yml
vendored
@ -44,28 +44,32 @@ jobs:
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
gpu_flavor: [mi210]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
||||
container:
|
||||
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
rocminfo | grep "Agent" -A 14
|
||||
- name: Show HIP environment
|
||||
run: |
|
||||
echo "HIP: $HIP_VISIBLE_DEVICES"
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
setup_gpu:
|
||||
name: Setup
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
gpu_flavor: [mi210]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
||||
container:
|
||||
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
@ -150,7 +154,7 @@ jobs:
|
||||
echo "matrix=$keys" >> $GITHUB_OUTPUT
|
||||
echo "test_map=$test_map" >> $GITHUB_OUTPUT
|
||||
|
||||
run_tests_single_gpu:
|
||||
run_tests_amdgpu:
|
||||
name: Model tests
|
||||
needs: setup_gpu
|
||||
# `dummy` means there is no test to run
|
||||
@ -159,12 +163,12 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
||||
machine_type: [single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
gpu_flavor: [mi210]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
||||
container:
|
||||
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
@ -216,7 +220,11 @@ jobs:
|
||||
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
rocminfo | grep "Agent" -A 14
|
||||
- name: Show HIP environment
|
||||
run: |
|
||||
echo "HIP: $HIP_VISIBLE_DEVICES"
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
@ -252,8 +260,7 @@ jobs:
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup_gpu,
|
||||
run_tests_single_gpu,
|
||||
# run_tests_multi_gpu,
|
||||
run_tests_amdgpu,
|
||||
# run_tests_torch_cuda_extensions_single_gpu,
|
||||
# run_tests_torch_cuda_extensions_multi_gpu
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user