Skip to content

Commit

Permalink
add deepspeed scheduled test for amd
Browse files Browse the repository at this point in the history
  • Loading branch information
echarlaix committed Nov 21, 2023
1 parent f93c1e9 commit 7857978
Showing 1 changed file with 55 additions and 2 deletions.
57 changes: 55 additions & 2 deletions .github/workflows/self-scheduled-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,59 @@ jobs:
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu

# Runs the tests/deepspeed and tests/extended suites on self-hosted AMD GPU
# runners, once per machine type in the matrix. Report and artifact names keep
# the "torch_rocm_extensions" spelling so anything keyed on those names is
# unaffected.
run_all_tests_torch_rocm_deepspeed_gpu:
  name: Torch ROCm extension tests
  strategy:
    # Let the single-gpu and multi-gpu legs finish independently of each other.
    fail-fast: false
    matrix:
      machine_type: [single-gpu, multi-gpu]

  runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
  needs: setup
  container:
    image: huggingface/transformers-pytorch-amd-gpu
    options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    # Every step below works in /transformers: it is the checkout this step
    # refreshes, so the tests run against exactly the commit under test.
    - name: Update clone
      working-directory: /transformers
      run: git fetch && git checkout ${{ github.sha }}

    - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
      working-directory: /transformers
      # The extras spec is quoted so the shell never treats "[deepspeed]" as a glob.
      run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[deepspeed]"

    - name: ROCM-SMI
      run: |
        rocm-smi

    - name: Show ROCR environment
      run: |
        echo "ROCR: $ROCR_VISIBLE_DEVICES"

    - name: Environment
      working-directory: /transformers
      run: |
        python3 utils/print_env.py

    - name: Show installed libraries and their versions
      working-directory: /transformers
      # Use the same interpreter as the install step so the listing reflects
      # the environment the tests will actually import from.
      run: python3 -m pip freeze

    - name: Run all tests on GPU
      working-directory: /transformers
      run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_rocm_extensions_gpu tests/deepspeed tests/extended

    - name: Failure short reports
      if: ${{ failure() }}
      # Best effort: the report file may be missing if pytest never started.
      continue-on-error: true
      run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_extensions_gpu/failures_short.txt

    - name: Test suite reports artifacts
      # Upload even when tests fail so the reports can be inspected afterwards.
      if: ${{ always() }}
      uses: actions/upload-artifact@v3
      with:
        name: ${{ matrix.machine_type }}_run_tests_torch_rocm_extensions_gpu_test_reports
        path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_extensions_gpu

run_extract_warnings:
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
Expand All @@ -368,7 +421,7 @@ jobs:
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
# run_all_tests_torch_cuda_extensions_gpu
run_all_tests_torch_rocm_deepspeed_gpu
]
steps:
- name: Checkout transformers
Expand Down Expand Up @@ -417,7 +470,7 @@ jobs:
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
# run_all_tests_torch_cuda_extensions_gpu,
run_all_tests_torch_rocm_deepspeed_gpu,
run_extract_warnings
]
steps:
Expand Down

0 comments on commit 7857978

Please sign in to comment.