diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index f109b0cd263ef5..6b4ef7a1b6b058 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -89,7 +89,7 @@ jobs:
         name: Identify models to test
         working-directory: /transformers/tests
         run: |
-          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2[:5] + d1[:5]; print(d)')" >> $GITHUB_OUTPUT
+          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2[:5] + d1[:0]; print(d)')" >> $GITHUB_OUTPUT
 
       - name: ROCM-SMI
         run: |
@@ -242,62 +242,62 @@ jobs:
           name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
           path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
 
-  run_examples_gpu:
-    name: Examples tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run examples tests on GPU
-        working-directory: /transformers
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_examples_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+#  run_examples_gpu:
+#    name: Examples tests
+#    strategy:
+#      fail-fast: false
+#      matrix:
+#        machine_type: [single-gpu]
+#    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+#    container:
+#      image: huggingface/transformers-pytorch-amd-gpu
+#      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    needs: [setup, run_tests_single_gpu]
+#    steps:
+#      - name: Update clone
+#        working-directory: /transformers
+#        run: git fetch && git checkout ${{ github.sha }}
+#
+#      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+#        working-directory: /transformers
+#        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+#
+#      - name: ROCM-SMI
+#        run: |
+#          rocm-smi
+#      - name: ROCM-INFO
+#        run: |
+#          rocminfo | grep "Agent" -A 14
+#      - name: Show ROCR environment
+#        run: |
+#          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+#
+#      - name: Environment
+#        working-directory: /transformers
+#        run: |
+#          python3 utils/print_env.py
+#
+#      - name: Show installed libraries and their versions
+#        working-directory: /transformers
+#        run: pip freeze
+#
+#      - name: Run examples tests on GPU
+#        working-directory: /transformers
+#        run: |
+#          pip install -r examples/pytorch/_tests_requirements.txt
+#          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+#
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        continue-on-error: true
+#        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v3
+#        with:
+#          name: ${{ matrix.machine_type }}_run_examples_gpu
+#          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
 
   run_pipelines_torch_gpu:
     name: PyTorch pipelines tests
@@ -309,7 +309,7 @@ jobs:
     container:
       image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu]
     steps:
       - name: Update clone
         working-directory: /transformers