diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml
index b1271aef601..0dfbbca7ba1 100644
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@@ -44,28 +44,32 @@ jobs:
     needs: check_runner_status
     strategy:
       matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        machine_type: [single-gpu, multi-gpu]
+        gpu_flavor: [mi210]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
     container:
-      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: ROCM-SMI
         run: |
-          rocm-smi
+          rocminfo | grep "Agent" -A 14
+      - name: Show HIP environment
+        run: |
+          echo "HIP: $HIP_VISIBLE_DEVICES"
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
   setup_gpu:
     name: Setup
     needs: check_runners
     strategy:
       matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        machine_type: [single-gpu, multi-gpu]
+        gpu_flavor: [mi210]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
     container:
-      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
       test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -150,7 +154,7 @@ jobs:
           echo "matrix=$keys" >> $GITHUB_OUTPUT
           echo "test_map=$test_map" >> $GITHUB_OUTPUT
 
-  run_tests_single_gpu:
+  run_tests_amdgpu:
     name: Model tests
     needs: setup_gpu
     # `dummy` means there is no test to run
@@ -159,12 +163,12 @@ jobs:
       fail-fast: false
       matrix:
         folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        machine_type: [single-gpu, multi-gpu]
+        gpu_flavor: [mi210]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
     container:
-      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       # Necessary to get the correct branch name and commit SHA for `workflow_run` event
       # We also take into account the `push` event (we might want to test some changes in a branch)
@@ -216,7 +220,11 @@ jobs:
 
       - name: ROCM-SMI
         run: |
-          rocm-smi
+          rocminfo | grep "Agent" -A 14
+      - name: Show HIP environment
+        run: |
+          echo "HIP: $HIP_VISIBLE_DEVICES"
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment
         working-directory: /transformers
@@ -252,8 +260,7 @@ jobs:
       check_runner_status,
       check_runners,
       setup_gpu,
-      run_tests_single_gpu,
-#      run_tests_multi_gpu,
+      run_tests_amdgpu,
 #      run_tests_torch_cuda_extensions_single_gpu,
 #      run_tests_torch_cuda_extensions_multi_gpu
     ]