From c19c19f94613925b595121eb7effe8a331e81a58 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Mon, 11 Dec 2023 16:33:36 +0100
Subject: [PATCH] Add deepspeed test to amd scheduled CI (#27633)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add deepspeed scheduled test for amd

* fix image

* add dockerfile

* add comment

* enable tests

* trigger

* remove trigger for this branch

* trigger

* change runner env to trigger the docker build image test

* use new docker image

* remove test suffix from docker image tag

* replace test docker image with original image

* push new image

* Trigger

* add back amd tests

* fix typo

* add amd tests back

* fix

* comment until docker image build scheduled test fix

* remove deprecated deepspeed build option

* upgrade torch

* update docker & make tests pass

* Update docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile

* fix

* tmp disable test

* precompile deepspeed to avoid timeout during tests

* fix comment

* trigger deepspeed tests with new image

* comment tests

* trigger

* add sklearn dependency to fix slow tests

* enable back other tests

* final update

---------

Co-authored-by: Felix Marty <felix@hf.co>
Co-authored-by: Félix Marty <9808326+fxmarty@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .github/workflows/build-docker-images.yml     | 36 +++++++++++
 .github/workflows/self-nightly-scheduled.yml  |  4 +-
 .github/workflows/self-past.yml               |  4 +-
 .github/workflows/self-push.yml               |  4 +-
 .github/workflows/self-scheduled-amd.yml      | 61 ++++++++++++++++++-
 .github/workflows/self-scheduled.yml          |  2 +-
 .../transformers-pytorch-amd-gpu/Dockerfile   |  4 ++
 .../Dockerfile                                | 45 ++++++++++++++
 .../Dockerfile                                |  4 +-
 tests/deepspeed/test_deepspeed.py             |  4 +-
 10 files changed, 155 insertions(+), 13 deletions(-)
 create mode 100644 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index b267ad7882d89f..be070a95d3a94f 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -271,3 +271,39 @@ jobs:
             REF=main
           push: true
           tags: huggingface/transformers-tensorflow-gpu
+
+  # latest-pytorch-deepspeed-amd:
+  #   name: "PyTorch + DeepSpeed (AMD) [dev]"
+
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+  #   steps:
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - name: Check out code
+  #       uses: actions/checkout@v3
+  #     - name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     - name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+  #     # Push CI images still need to be re-built daily
+  #     -
+  #       name: Build and push (for Push CI) in a daily basis
+  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+  #       # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
+  #       if: inputs.image_postfix != '-push-ci'
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
index e4b4f7f77cf077..37dc98f340a16d 100644
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -212,7 +212,7 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -286,4 +286,4 @@ jobs:
         with:
           name: |
               single-*
-              multi-*
\ No newline at end of file
+              multi-*
diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
index 2ece4388d27c99..ed60c92f6745a8 100644
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -267,7 +267,7 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -353,4 +353,4 @@ jobs:
         with:
           name: |
               single-*
-              multi-*
\ No newline at end of file
+              multi-*
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index a6ea5b1e04b942..e6f1f3b3050f7a 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -366,7 +366,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -456,7 +456,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index ef1c4ddaa072f7..3d41a3b95e6c50 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -356,6 +356,63 @@ jobs:
           name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
           path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
+  run_tests_torch_deepspeed_gpu:
+    name: Torch ROCm deepspeed tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    needs: setup
+    container:
+      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu
+
   run_extract_warnings:
     name: Extract warnings in CI artifacts
     runs-on: ubuntu-22.04
@@ -368,7 +425,7 @@ jobs:
       run_tests_multi_gpu,
       run_examples_gpu,
       run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu
+      run_tests_torch_deepspeed_gpu
     ]
     steps:
       - name: Checkout transformers
@@ -417,7 +474,7 @@ jobs:
       run_tests_multi_gpu,
       run_examples_gpu,
       run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu,
+      run_tests_torch_deepspeed_gpu,
       run_extract_warnings
     ]
     steps:
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 8d3c23d01e8018..995df2e07880ac 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -366,7 +366,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile
index 216ff4c4385548..46ca1a531b4ab4 100644
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@@ -22,7 +22,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+htt
 
 ARG REF=main
 WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
 
 RUN python3 -m pip uninstall -y tensorflow flax
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
new file mode 100644
index 00000000000000..1fa384dfa2bc03
--- /dev/null
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -0,0 +1,45 @@
+FROM rocm/dev-ubuntu-22.04:5.6
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG PYTORCH='2.1.1'
+ARG TORCH_VISION='0.16.1'
+ARG TORCH_AUDIO='2.1.1'
+ARG ROCM='5.6'
+
+RUN apt update && \
+    apt install -y --no-install-recommends \
+    libaio-dev \
+    git \
+    # These are required to build deepspeed.
+    python3-dev \
+    python-is-python3 \
+    rocrand-dev \
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblas-dev \
+    rocblas-dev && \
+    apt clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
+RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
+RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
+
+# Pre-build DeepSpeed so it's ready for testing (to avoid timeout)
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
+
+ARG REF=main
+WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
+RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# this line must be added in order for python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
+
+RUN python3 -c "from deepspeed.launcher.runner import main"
\ No newline at end of file
diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
index 11a64672058522..a7b08a8c60d31d 100644
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt
 
 # recompile apex
 RUN python3 -m pip uninstall -y apex
-RUN git clone https://github.com/NVIDIA/apex
+# RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
 # TODO: check if there is alternative way to install latest apex
 # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
@@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 2352cf522f29a7..14c8f6703166c9 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -561,8 +561,8 @@ def test_gradient_accumulation(self, stage, dtype):
         self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
         self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
 
-        # see the note above how to get identical loss on a small bs
-        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
+        # Symmetric relative difference (abs). See the note above how to get identical loss on a small bs
+        self.assertTrue(abs(no_grad_accum_loss - yes_grad_accum_loss) / (abs(no_grad_accum_loss) + 1e-15) <= 1e-3)
 
     def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
         # adapted from TrainerIntegrationCommon.check_saved_checkpoints