diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 16943139f8448d..45e953602f9b39 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -106,31 +106,19 @@ jobs: run: | python3 utils/print_env.py - run_tests_single_gpu: - name: Single GPU tests + run_examples_gpu: + name: Examples tests strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. fail-fast: false matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup +# needs: [setup, run_tests_single_gpu] + needs: [setup] steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} @@ -158,35 +146,37 @@ jobs: working-directory: /transformers run: pip freeze - - name: Run all tests on GPU + - name: Run examples tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + name: ${{ matrix.machine_type }}_run_examples_gpu + path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - run_tests_multi_gpu: - name: Multi GPU tests + run_tests_single_gpu: + name: Single GPU tests strategy: - max-parallel: 1 + max-parallel: 1 # For now, not to parallelize. Can change later if it works well. fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] + machine_type: [single-gpu] runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [setup, run_examples_gpu] steps: - name: Echo folder ${{ matrix.folders }} shell: bash @@ -242,19 +232,31 @@ jobs: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - run_examples_gpu: - name: Examples tests + run_tests_multi_gpu: + name: Multi GPU tests strategy: + max-parallel: 1 fail-fast: false matrix: - machine_type: [single-gpu] + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [multi-gpu] runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# needs: [setup, run_tests_single_gpu] - needs: [setup] + needs: setup steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} @@ -282,23 +284,23 @@ jobs: working-directory: /transformers run: pip freeze - - name: Run examples tests on GPU + - name: Run all tests on GPU working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_pipelines_torch_gpu: name: PyTorch pipelines tests