From c19c19f94613925b595121eb7effe8a331e81a58 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 11 Dec 2023 16:33:36 +0100 Subject: [PATCH] Add deepspeed test to amd scheduled CI (#27633) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add deepspeed scheduled test for amd * fix image * add dockerfile * add comment * enable tests * trigger * remove trigger for this branch * trigger * change runner env to trigger the docker build image test * use new docker image * remove test suffix from docker image tag * replace test docker image with original image * push new image * Trigger * add back amd tests * fix typo * add amd tests back * fix * comment until docker image build scheduled test fix * remove deprecated deepspeed build option * upgrade torch * update docker & make tests pass * Update docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile * fix * tmp disable test * precompile deepspeed to avoid timeout during tests * fix comment * trigger deepspeed tests with new image * comment tests * trigger * add sklearn dependency to fix slow tests * enable back other tests * final update --------- Co-authored-by: Felix Marty Co-authored-by: Félix Marty <9808326+fxmarty@users.noreply.github.com> Co-authored-by: ydshieh --- .github/workflows/build-docker-images.yml | 36 +++++++++++ .github/workflows/self-nightly-scheduled.yml | 4 +- .github/workflows/self-past.yml | 4 +- .github/workflows/self-push.yml | 4 +- .github/workflows/self-scheduled-amd.yml | 61 ++++++++++++++++++- .github/workflows/self-scheduled.yml | 2 +- .../transformers-pytorch-amd-gpu/Dockerfile | 4 ++ .../Dockerfile | 45 ++++++++++++++ .../Dockerfile | 4 +- tests/deepspeed/test_deepspeed.py | 4 +- 10 files changed, 155 insertions(+), 13 deletions(-) create mode 100644 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile diff --git a/.github/workflows/build-docker-images.yml 
b/.github/workflows/build-docker-images.yml index b267ad7882d89f..be070a95d3a94f 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -271,3 +271,39 @@ jobs: REF=main push: true tags: huggingface/transformers-tensorflow-gpu + + # latest-pytorch-deepspeed-amd: + # name: "PyTorch + DeepSpeed (AMD) [dev]" + + # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + # steps: + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Check out code + # uses: actions/checkout@v3 + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The latter case is useful for manual image building for debugging purposes. Use another tag in this case! 
+ # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index e4b4f7f77cf077..37dc98f340a16d 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -212,7 +212,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -286,4 +286,4 @@ jobs: with: name: | single-* - multi-* \ No newline at end of file + multi-* diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index 2ece4388d27c99..ed60c92f6745a8 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -267,7 +267,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -353,4 +353,4 @@ jobs: with: name: | single-* - multi-* \ No newline at end of file + multi-* diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index a6ea5b1e04b942..e6f1f3b3050f7a 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -366,7 +366,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -456,7 +456,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index ef1c4ddaa072f7..3d41a3b95e6c50 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -356,6 +356,63 @@ jobs: name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + run_tests_torch_deepspeed_gpu: + name: Torch ROCm deepspeed tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ 
matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + needs: setup + container: + image: huggingface/transformers-pytorch-deepspeed-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu + run_extract_warnings: name: Extract warnings in CI artifacts runs-on: ubuntu-22.04 @@ -368,7 +425,7 @@ jobs: run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu, - # run_all_tests_torch_cuda_extensions_gpu + run_tests_torch_deepspeed_gpu ] steps: - name: Checkout transformers @@ -417,7 +474,7 @@ jobs: run_tests_multi_gpu, 
run_examples_gpu, run_pipelines_torch_gpu, - # run_all_tests_torch_cuda_extensions_gpu, + run_tests_torch_deepspeed_gpu, run_extract_warnings ] steps: diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 8d3c23d01e8018..995df2e07880ac 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -366,7 +366,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index 216ff4c4385548..46ca1a531b4ab4 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -22,7 +22,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+htt ARG REF=main WORKDIR / + +# Invalidate docker cache from here if new commit is available. 
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] RUN python3 -m pip uninstall -y tensorflow flax diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile new file mode 100644 index 00000000000000..1fa384dfa2bc03 --- /dev/null +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -0,0 +1,45 @@ +FROM rocm/dev-ubuntu-22.04:5.6 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive +ARG PYTORCH='2.1.1' +ARG TORCH_VISION='0.16.1' +ARG TORCH_AUDIO='2.1.1' +ARG ROCM='5.6' + +RUN apt update && \ + apt install -y --no-install-recommends \ + libaio-dev \ + git \ + # These are required to build deepspeed. + python3-dev \ + python-is-python3 \ + rocrand-dev \ + rocthrust-dev \ + hipsparse-dev \ + hipblas-dev \ + rocblas-dev && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2" +RUN python3 -m pip uninstall -y apex torch torchvision torchaudio +RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir + +# Pre-build DeepSpeed, so it's ready for testing (to avoid timeout) +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1 + +ARG REF=main +WORKDIR / + +# Invalidate docker cache from here if new commit is available. 
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + +RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn] + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop + +RUN python3 -c "from deepspeed.launcher.runner import main" \ No newline at end of file diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 11a64672058522..a7b08a8c60d31d 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt # recompile apex RUN python3 -m pip uninstall -y apex -RUN git clone https://github.com/NVIDIA/apex +# RUN git clone https://github.com/NVIDIA/apex # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners # TODO: check if there is alternative way to install latest apex # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . @@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. 
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 2352cf522f29a7..14c8f6703166c9 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -561,8 +561,8 @@ def test_gradient_accumulation(self, stage, dtype): self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5) self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5) - # see the note above how to get identical loss on a small bs - self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2) + # Relative difference. See the note above how to get identical loss on a small bs + self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3) def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype): # adapted from TrainerIntegrationCommon.check_saved_checkpoints