From ebbd69f79c094c312a1b0f2b7d614ee5812d0f2e Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Fri, 17 Nov 2023 10:33:23 +0100
Subject: [PATCH] fix

---
Notes (ignored by `git am`; not part of the commit message):
  * In each of the six container definitions touched below, `--env HIP_VISIBLE_DEVICES`
    is dropped from `options`; `--env ROCR_VISIBLE_DEVICES` is kept.
  * The "Show HIP environment" step is renamed "Show ROCR environment" and no longer
    echoes $HIP_VISIBLE_DEVICES.
  * A new "ROCM-INFO" step runs `rocminfo | grep "Agent" -A 14` after "ROCM-SMI".
  * NOTE(review): this copy of the patch had lost its line breaks and indentation;
    whitespace inside the hunks was reconstructed from the YAML nesting. Confirm
    against the target file before applying.

 .github/workflows/self-scheduled-amd.yml | 48 +++++++++++++++---------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 3331f269227e48..3af97045ca2b2a 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -44,14 +44,16 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: ROCM-SMI
         run: |
           rocm-smi
-      - name: Show HIP environment
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
   setup:
@@ -63,7 +65,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -92,9 +94,11 @@ jobs:
       - name: ROCM-SMI
         run: |
           rocm-smi
-      - name: Show HIP environment
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
   run_tests_single_gpu:
@@ -108,7 +112,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
@@ -133,9 +137,11 @@ jobs:
       - name: ROCM-SMI
         run: |
           rocm-smi
-      - name: Show HIP environment
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment
@@ -174,7 +180,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
@@ -199,9 +205,11 @@ jobs:
       - name: ROCM-SMI
         run: |
           rocm-smi
-      - name: Show HIP environment
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment
@@ -238,7 +246,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
@@ -252,9 +260,11 @@ jobs:
       - name: ROCM-SMI
         run: |
           rocm-smi
-      - name: Show HIP environment
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment
@@ -293,7 +303,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
@@ -307,9 +317,11 @@ jobs:
       - name: ROCM-SMI
         run: |
           rocm-smi
-      - name: Show HIP environment
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment