Skip to content

Commit

Permalink
Merge branch 'main' into smangrul/fix-fsdp-full-state-dict-resume
Browse files Browse the repository at this point in the history
  • Loading branch information
pacman100 committed Dec 15, 2023
2 parents 239641c + ffa04de commit fe81718
Show file tree
Hide file tree
Showing 368 changed files with 23,942 additions and 2,506 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/build-docker-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,39 @@ jobs:
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu

# latest-pytorch-deepspeed-amd:
# name: "PyTorch + DeepSpeed (AMD) [dev]"

# runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
# steps:
# - name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# - name: Check out code
# uses: actions/checkout@v3
# - name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# - name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) in a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
4 changes: 2 additions & 2 deletions .github/workflows/self-nightly-scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ jobs:
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
Expand Down Expand Up @@ -286,4 +286,4 @@ jobs:
with:
name: |
single-*
multi-*
multi-*
4 changes: 2 additions & 2 deletions .github/workflows/self-past.yml
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ jobs:
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
Expand Down Expand Up @@ -353,4 +353,4 @@ jobs:
with:
name: |
single-*
multi-*
multi-*
2 changes: 1 addition & 1 deletion .github/workflows/self-push-amd-mi210-caller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name == 'push') && (github.ref_name == 'main' || startsWith(github.ref_name, 'run_amd_push_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi210
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/self-push-amd-mi250-caller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name == 'push') && (github.ref_name == 'main' || startsWith(github.ref_name, 'run_amd_push_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi250
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/self-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ jobs:
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
Expand Down Expand Up @@ -456,7 +456,7 @@ jobs:
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/self-scheduled-amd-mi210-caller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi210
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/self-scheduled-amd-mi250-caller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi250
Expand Down
61 changes: 59 additions & 2 deletions .github/workflows/self-scheduled-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,63 @@ jobs:
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu

run_tests_torch_deepspeed_gpu:
name: Torch ROCm deepspeed tests
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]

runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14
- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu

run_extract_warnings:
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
Expand All @@ -368,7 +425,7 @@ jobs:
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
# run_all_tests_torch_cuda_extensions_gpu
run_tests_torch_deepspeed_gpu
]
steps:
- name: Checkout transformers
Expand Down Expand Up @@ -417,7 +474,7 @@ jobs:
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
# run_all_tests_torch_cuda_extensions_gpu,
run_tests_torch_deepspeed_gpu,
run_extract_warnings
]
steps:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/self-scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ jobs:
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
Expand Down Expand Up @@ -494,5 +494,5 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: test_failure_tables
path: test_failure_tables
name: prev_ci_results
path: prev_ci_results
Loading

0 comments on commit fe81718

Please sign in to comment.