Skip to content

Commit

Permalink
[AMD] Add initial version for run_tests_multi_gpu (huggingface#26346)
Browse files: browse the repository at this point in the history
* Add initial version for run_tests_multi_gpu

* Trigger change in BERT

* fix typo setup -> setup_gpu

* Add tag mi210

* Enable multi-gpu jobs

* One more

* Use dynamic device allocation

* Attempt to fix syntax for docker create

* fix script path

* fix

* temp machine type

* fix label

* Enable multi-gpu tests

* Rename multi-amd-gpu to multi-gpu

* Let's not be lazy dude

* Update rocm-smi output

* Add gpu_flavor in the matrix

* Fix typos

* merge single/multi dispatch into the matrix

* Format.

* Revert BERT's change

---------

Co-authored-by: Guillaume LEGENDRE <[email protected]>
  • Loading branch information
2 people authored and blbadger committed Nov 8, 2023
1 parent bc43784 commit 1138a9c
Showing 1 changed file with 24 additions and 17 deletions.
41 changes: 24 additions & 17 deletions .github/workflows/self-push-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,28 +44,32 @@ jobs:
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: ROCM-SMI
run: |
rocm-smi
rocminfo | grep "Agent" -A 14
- name: Show HIP environment
run: |
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"
setup_gpu:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
test_map: ${{ steps.set-matrix.outputs.test_map }}
Expand Down Expand Up @@ -150,7 +154,7 @@ jobs:
echo "matrix=$keys" >> $GITHUB_OUTPUT
echo "test_map=$test_map" >> $GITHUB_OUTPUT
run_tests_single_gpu:
run_tests_amdgpu:
name: Model tests
needs: setup_gpu
# `dummy` means there is no test to run
Expand All @@ -159,12 +163,12 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
Expand Down Expand Up @@ -216,7 +220,11 @@ jobs:
- name: ROCM-SMI
run: |
rocm-smi
rocminfo | grep "Agent" -A 14
- name: Show HIP environment
run: |
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Environment
working-directory: /transformers
Expand Down Expand Up @@ -252,8 +260,7 @@ jobs:
check_runner_status,
check_runners,
setup_gpu,
run_tests_single_gpu,
# run_tests_multi_gpu,
run_tests_amdgpu,
# run_tests_torch_cuda_extensions_single_gpu,
# run_tests_torch_cuda_extensions_multi_gpu
]
Expand Down

0 comments on commit 1138a9c

Please sign in to comment.