[AMD] Add initial version for run_tests_multi_gpu (huggingface#26346)

* Add initial version for run_tests_multi_gpu
* Trigger change in BERT
* fix typo setup -> setup_gpu
* Add tag mi210
* Enable multi-gpu jobs
* One more
* Use dynamic device allocation
* Attempt to fix syntax for docker create
* fix script path
* fix
* temp machine type
* fix label
* Enable multi-gpu tests
* Rename multi-amd-gpu to multi-gpu
* Let's not be lazy dude
* Update rocm-smi output
* Add gpu_flavour in the matrix
* Fix typos
* merge single/multi dispatch into the matrix
* Format.
* Revert BERT's change

---------

Co-authored-by: Guillaume LEGENDRE <[email protected]>
blbadger · Nov 8, 2023 · 1138a9c · 1138a9c
1 parent bc43784
commit 1138a9c
Showing 1 changed file with 24 additions and 17 deletions.
diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml
@@ -44,28 +44,32 @@ jobs:
     needs: check_runner_status
     strategy:
       matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        machine_type: [single-gpu, multi-gpu]
+        gpu_flavor: [mi210]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
     container:
-      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: ROCM-SMI
         run: |
-          rocm-smi
+          rocminfo  | grep "Agent" -A 14
+      - name: Show HIP environment
+        run: |
+          echo "HIP: $HIP_VISIBLE_DEVICES"
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
   setup_gpu:
     name: Setup
     needs: check_runners
     strategy:
       matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        machine_type: [single-gpu, multi-gpu]
+        gpu_flavor: [mi210]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
     container:
-      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
       test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -150,7 +154,7 @@ jobs:
           echo "matrix=$keys" >> $GITHUB_OUTPUT
           echo "test_map=$test_map" >> $GITHUB_OUTPUT
 
-  run_tests_single_gpu:
+  run_tests_amdgpu:
     name: Model tests
     needs: setup_gpu
     # `dummy` means there is no test to run
@@ -159,12 +163,12 @@ jobs:
       fail-fast: false
       matrix:
         folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        machine_type: [single-gpu, multi-gpu]
+        gpu_flavor: [mi210]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
     container:
-      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       # Necessary to get the correct branch name and commit SHA for `workflow_run` event
       # We also take into account the `push` event (we might want to test some changes in a branch)
@@ -216,7 +220,11 @@ jobs:
 
       - name: ROCM-SMI
         run: |
-          rocm-smi
+          rocminfo  | grep "Agent" -A 14
+      - name: Show HIP environment
+        run: |
+          echo "HIP: $HIP_VISIBLE_DEVICES"
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment
         working-directory: /transformers
@@ -252,8 +260,7 @@ jobs:
         check_runner_status,
         check_runners,
         setup_gpu,
-        run_tests_single_gpu,
-#        run_tests_multi_gpu,
+        run_tests_amdgpu,
 #        run_tests_torch_cuda_extensions_single_gpu,
 #        run_tests_torch_cuda_extensions_multi_gpu
     ]