From 78579787b4abd6a762dcaf4c168baca3baa77377 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Tue, 21 Nov 2023 15:37:00 +0100
Subject: [PATCH] add deepspeed scheduled test for amd

---
 .github/workflows/self-scheduled-amd.yml | 63 +++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 17e907e40a5757..09f7ad17605c56 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -356,6 +356,65 @@ jobs:
           name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
           path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
+  # Runs tests/deepspeed and tests/extended on AMD GPUs for each machine_type.
+  # The first two steps operate on the checkout at /transformers, so every
+  # later step and report path uses that same root.
+  run_all_tests_torch_rocm_deepspeed_gpu:
+    name: Torch ROCm extension tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    needs: setup
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      # The extra is quoted so the shell never treats [deepspeed] as a glob.
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[deepspeed]"
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: python3 -m pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_rocm_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_extensions_gpu/failures_short.txt
+
+      # always() uploads whatever reports exist even when the test step failed.
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_rocm_extensions_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_extensions_gpu
+
   run_extract_warnings:
     name: Extract warnings in CI artifacts
     runs-on: ubuntu-22.04
@@ -368,7 +427,7 @@ jobs:
       run_tests_multi_gpu,
       run_examples_gpu,
       run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu
+      run_all_tests_torch_rocm_deepspeed_gpu
     ]
     steps:
       - name: Checkout transformers
@@ -417,7 +476,7 @@ jobs:
       run_tests_multi_gpu,
       run_examples_gpu,
       run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu,
+      run_all_tests_torch_rocm_deepspeed_gpu,
       run_extract_warnings
     ]
     steps: