diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 2008d589be4991..0da0e7d612c683 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -44,14 +44,13 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-test - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: ROCM-SMI run: | rocm-smi - - name: Show HIP environment + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: ROCMINFO run: | rocminfo | grep "Agent" -A 14 @@ -66,7 +66,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-test - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -95,13 +95,13 @@ jobs: - name: ROCM-SMI run: | rocm-smi - - name: Show HIP environment + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: ROCMINFO run: | rocminfo | grep "Agent" -A 14 + run_tests_single_gpu: name: Single GPU tests strategy: @@ -113,7 +113,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor 
}}'] container: image: huggingface/transformers-pytorch-amd-gpu-test - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: - name: Echo folder ${{ matrix.folders }} @@ -127,11 +127,6 @@ jobs: echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - name: ROCMINFO run: | rocminfo | grep "Agent" -A 14 @@ -147,9 +142,8 @@ jobs: - name: ROCM-SMI run: | rocm-smi - - name: Show HIP environment + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: Environment @@ -188,7 +182,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-test - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: - name: Echo folder ${{ matrix.folders }} @@ -213,9 +207,13 @@ jobs: - name: ROCM-SMI run: | rocm-smi - - name: Show HIP environment + + - name: ROCMINFO + run: | + rocminfo | grep "Agent" -A 14 + + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: Environment @@ -252,7 +250,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-test - options: --device 
/dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: - name: Update clone @@ -266,9 +264,8 @@ jobs: - name: ROCM-SMI run: | rocm-smi - - name: Show HIP environment + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: Environment @@ -307,7 +304,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-test - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: - name: Update clone @@ -321,9 +318,8 @@ jobs: - name: ROCM-SMI run: | rocm-smi - - name: Show HIP environment + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: Environment