From 1e8ce6607b49ca32c009e4e86467df8ae11a84b6 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 21 Nov 2023 15:37:00 +0100
Subject: [PATCH 01/33] add deepspeed scheduled test for amd

---
# Review notes. Text between the '---' line and the first 'diff --git' line is
# commentary: it is neither part of the commit message nor of the diff.
#
# What this patch does (self-scheduled-amd.yml):
#   * Adds the job `run_all_tests_torch_rocm_deepspeed_gpu`
#     ("Torch ROCm deepspeed tests"), a non-fail-fast matrix over
#     machine_type = single-gpu, multi-gpu. It needs `setup` and runs in the
#     `huggingface/transformers-pytorch-amd-gpu` container on self-hosted
#     runners labelled docker-gpu / amd-gpu / <machine_type> / <gpu_flavor>.
#   * Job steps: check out `github.sha` in /transformers, reinstall
#     transformers in editable mode with the `deepspeed` extra, print
#     ROCm / environment / pip information, run pytest on `tests/deepspeed`
#     and `tests/extended` with `--make-reports`, print `failures_short.txt`
#     when a step failed, and always upload the report folder as an artifact.
#   * In two `needs:` lists the commented-out entry
#     `run_all_tests_torch_cuda_extensions_gpu` is replaced by the new job id,
#     so those jobs now wait for the deepspeed job.
#
# NOTE(review): the uploaded artifact is named
#   <machine_type>_run_tests_torch_rocm_deepspeed_gpu_test_reports
# while the job id starts with `run_all_tests_`. Whether the jobs that consume
# the artifacts expect a particular name cannot be told from this patch --
# TODO confirm.
 .github/workflows/self-scheduled-amd.yml | 60 +++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 17e907e40a5757..774d814883e057 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -356,6 +356,62 @@ jobs:
           name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
           path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
+  run_all_tests_torch_rocm_deepspeed_gpu:
+    name: Torch ROCm deepspeed tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    needs: setup
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[deepspeed]
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_rocm_deepspeed_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu
+
   run_extract_warnings:
     name: Extract warnings in CI artifacts
     runs-on: ubuntu-22.04
@@ -368,7 +424,7 @@ jobs:
       run_tests_multi_gpu,
       run_examples_gpu,
       run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu
+      run_all_tests_torch_rocm_deepspeed_gpu
     ]
     steps:
       - name: Checkout transformers
@@ -417,7 +473,7 @@ jobs:
       run_tests_multi_gpu,
       run_examples_gpu,
       run_pipelines_torch_gpu,
-      # run_all_tests_torch_cuda_extensions_gpu,
+      run_all_tests_torch_rocm_deepspeed_gpu,
       run_extract_warnings
     ]
     steps:

From bf276ed0363fe817d9d11dfb9058b4fc094caa07 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 22 Nov 2023 00:32:51 +0100
Subject: [PATCH 02/33] fix image: use transformers-pytorch-amd-gpu-test

---
# Review notes. Text between the '---' line and the first 'diff --git' line is
# commentary: it is neither part of the commit message nor of the diff.
#
# What this patch does (self-scheduled-amd.yml): changes one job's container
# image from `huggingface/transformers-pytorch-amd-gpu` to
# `huggingface/transformers-pytorch-amd-gpu-test`. Judging by the hunk
# position and the context lines (`needs: setup` placed before `container:`),
# this is presumably the `run_all_tests_torch_rocm_deepspeed_gpu` job.
#
# NOTE(review): the patch does not say why the image changes. The `-test`
# suffix looks like a temporary/debug tag -- TODO confirm it is replaced by a
# permanent image before the series is merged.
 .github/workflows/self-scheduled-amd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 774d814883e057..0bf9c092c87fd5 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -366,7 +366,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
-      image: huggingface/transformers-pytorch-amd-gpu
+      image: huggingface/transformers-pytorch-amd-gpu-test
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone

From 2cfb53d1d5486e7e841e6460b3a4208d7a6a97ec Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 23 Nov 2023 17:55:33 +0100
Subject: [PATCH 03/33] add deepspeed AMD dockerfile and image build job

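---
# Review notes. Text between the '---' line and the first 'diff --git' line is
# commentary: it is neither part of the commit message nor of the diff.
#
# build-docker-images.yml, as changed by this patch:
#   * Every existing build job (latest-docker, latest-torch-deepspeed-docker,
#     latest-torch-deepspeed-docker-for-push-ci-daily-build, doc-builder,
#     latest-pytorch, latest-tensorflow) is commented out, and the already
#     disabled latest-pytorch-amd block is re-commented. Its explanatory line
#     "Need to be fixed with the help from Guillaume." is dropped.
#   * The only active job left is the new `latest-pytorch-deepspeed-amd`,
#     built from ./docker/transformers-pytorch-deepspeed-amd-gpu.
#   * That job contains two "Build and push" steps: one tagged
#     `...-deepspeed-amd-gpu${{ inputs.image_postfix }}-test`, and the former
#     push-ci daily-build step, now tagged `...-deepspeed-amd-gpu-push-ci-test`.
#   * A commented-out push-trigger branch
#     (`run_amd_scheduled_ci_caller_deepspeed_test`) is added.
#   * The file now ends without a trailing newline.
#
# NOTE(review): this looks like a debugging state for building one image in
# isolation -- TODO confirm the disabled jobs and the trailing newline are
# restored later in the series.
# NOTE(review): in self-scheduled-amd.yml this patch removes the definitions
# of run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu and
# run_pipelines_torch_gpu. The hunk has equal old/new line counts, so they are
# presumably re-added as comments. Those job ids still appear in the `needs:`
# lists shown in patch 01; if the lists are not updated too, the workflow
# will reference undefined jobs -- TODO confirm.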
 .github/workflows/build-docker-images.yml     | 493 +++++++++--------
 .github/workflows/self-scheduled-amd.yml      | 496 +++++++++---------
 .../Dockerfile                                |  19 +
 3 files changed, 531 insertions(+), 477 deletions(-)
 create mode 100644 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index b267ad7882d89f..6c567566d4d64d 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
+      # - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
@@ -18,118 +19,280 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
-  latest-docker:
-    name: "Latest PyTorch + TensorFlow [dev]"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-all-latest-gpu-push-ci
+  # latest-docker:
+  #   name: "Latest PyTorch + TensorFlow [dev]"
+  #   runs-on: ubuntu-22.04
+  #   steps:
+  #     - name: Cleanup disk
+  #       run: |
+  #         sudo ls -l /usr/local/lib/
+  #         sudo ls -l /usr/share/
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #         sudo rm -rf /usr/local/lib/android
+  #         sudo rm -rf /usr/share/dotnet
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #     -
+  #       name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     -
+  #       name: Check out code
+  #       uses: actions/checkout@v3
+  #     -
+  #       name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     -
+  #       name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-all-latest-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
+  #     # Push CI images still need to be re-built daily
+  #     -
+  #       name: Build and push (for Push CI) in a daily basis
+  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+  #       if: inputs.image_postfix != '-push-ci'
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-all-latest-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-all-latest-gpu-push-ci
+
+  # latest-torch-deepspeed-docker:
+  #   name: "Latest PyTorch + DeepSpeed"
+  #   runs-on: ubuntu-22.04
+  #   steps:
+  #     - name: Cleanup disk
+  #       run: |
+  #         sudo ls -l /usr/local/lib/
+  #         sudo ls -l /usr/share/
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #         sudo rm -rf /usr/local/lib/android
+  #         sudo rm -rf /usr/share/dotnet
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #     -
+  #       name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     -
+  #       name: Check out code
+  #       uses: actions/checkout@v3
+  #     -
+  #       name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     -
+  #       name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
+
+  # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
+  # latest-torch-deepspeed-docker-for-push-ci-daily-build:
+  #   name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
+  #   runs-on: ubuntu-22.04
+  #   steps:
+  #     - name: Cleanup disk
+  #       run: |
+  #         sudo ls -l /usr/local/lib/
+  #         sudo ls -l /usr/share/
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #         sudo rm -rf /usr/local/lib/android
+  #         sudo rm -rf /usr/share/dotnet
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #     -
+  #       name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     -
+  #       name: Check out code
+  #       uses: actions/checkout@v3
+  #     -
+  #       name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     # Push CI images still need to be re-built daily
+  #     -
+  #       name: Build and push (for Push CI) in a daily basis
+  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+  #       if: inputs.image_postfix != '-push-ci'
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+
+  # doc-builder:
+  #   name: "Doc builder"
+  #   # Push CI doesn't need this image
+  #   if: inputs.image_postfix != '-push-ci'
+  #   runs-on: ubuntu-22.04
+  #   steps:
+  #     -
+  #       name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     -
+  #       name: Check out code
+  #       uses: actions/checkout@v3
+  #     -
+  #       name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     -
+  #       name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-doc-builder
+  #         push: true
+  #         tags: huggingface/transformers-doc-builder
+
+  # latest-pytorch:
+  #   name: "Latest PyTorch [dev]"
+  #   # Push CI doesn't need this image
+  #   if: inputs.image_postfix != '-push-ci'
+  #   runs-on: ubuntu-22.04
+  #   steps:
+  #     - name: Cleanup disk
+  #       run: |
+  #         sudo ls -l /usr/local/lib/
+  #         sudo ls -l /usr/share/
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #         sudo rm -rf /usr/local/lib/android
+  #         sudo rm -rf /usr/share/dotnet
+  #         sudo du -sh /usr/local/lib/
+  #         sudo du -sh /usr/share/
+  #     -
+  #       name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     -
+  #       name: Check out code
+  #       uses: actions/checkout@v3
+  #     -
+  #       name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     -
+  #       name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-gpu
+
+  # latest-pytorch-amd:
+  #   name: "Latest PyTorch (AMD) [dev]"
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+  #   steps:
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - name: Check out code
+  #       uses: actions/checkout@v3
+  #     - name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     - name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
+  #     # Push CI images still need to be re-built daily
+  #     - name: Build and push (for Push CI) in a daily basis
+  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+  #       if: inputs.image_postfix != '-push-ci'
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-amd-gpu-push-ci
 
-  latest-torch-deepspeed-docker:
-    name: "Latest PyTorch + DeepSpeed"
-    runs-on: ubuntu-22.04
+  # latest-tensorflow:
+  #   name: "Latest TensorFlow [dev]"
+  #   # Push CI doesn't need this image
+  #   if: inputs.image_postfix != '-push-ci'
+  #   runs-on: ubuntu-22.04
+  #   steps:
+  #     -
+  #       name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     -
+  #       name: Check out code
+  #       uses: actions/checkout@v3
+  #     -
+  #       name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     -
+  #       name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-tensorflow-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-tensorflow-gpu
+
+  latest-pytorch-deepspeed-amd:
+    name: "PyTorch + DeepSpeed (AMD)"
+
+    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
     steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
+      - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
+      - name: Check out code
         uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
+      - name: Login to DockerHub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
+      - name: Build and push
         uses: docker/build-push-action@v5
         with:
-          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
           build-args: |
             REF=main
           push: true
-          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
-
-  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
-  latest-torch-deepspeed-docker-for-push-ci-daily-build:
-    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test
       # Push CI images still need to be re-built daily
       -
         name: Build and push (for Push CI) in a daily basis
@@ -138,136 +301,8 @@ jobs:
         if: inputs.image_postfix != '-push-ci'
         uses: docker/build-push-action@v5
         with:
-          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-
-  doc-builder:
-    name: "Doc builder"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-doc-builder
-          push: true
-          tags: huggingface/transformers-doc-builder
-
-  latest-pytorch:
-    name: "Latest PyTorch [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-gpu
-
-# Need to be fixed with the help from Guillaume.
-#  latest-pytorch-amd:
-#    name: "Latest PyTorch (AMD) [dev]"
-#    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
-#    steps:
-#      - name: Set up Docker Buildx
-#        uses: docker/setup-buildx-action@v3
-#      - name: Check out code
-#        uses: actions/checkout@v3
-#      - name: Login to DockerHub
-#        uses: docker/login-action@v3
-#        with:
-#          username: ${{ secrets.DOCKERHUB_USERNAME }}
-#          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-#      - name: Build and push
-#        uses: docker/build-push-action@v5
-#        with:
-#          context: ./docker/transformers-pytorch-amd-gpu
-#          build-args: |
-#            REF=main
-#          push: true
-#          tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
-#      # Push CI images still need to be re-built daily
-#      -
-#        name: Build and push (for Push CI) in a daily basis
-#        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-#        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-#        if: inputs.image_postfix != '-push-ci'
-#        uses: docker/build-push-action@v5
-#        with:
-#          context: ./docker/transformers-pytorch-amd-gpu
-#          build-args: |
-#            REF=main
-#          push: true
-#          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
-
-  latest-tensorflow:
-    name: "Latest TensorFlow [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-tensorflow-gpu
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
           build-args: |
             REF=main
           push: true
-          tags: huggingface/transformers-tensorflow-gpu
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci-test
\ No newline at end of file
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 0bf9c092c87fd5..2cc44f553f4944 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -107,254 +107,254 @@ jobs:
         run: |
           python3 utils/print_env.py
 
-  run_tests_single_gpu:
-    name: Single GPU tests
-    strategy:
-      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_tests_multi_gpu:
-    name: Multi GPU tests
-    strategy:
-      max-parallel: 1
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_examples_gpu:
-    name: Examples tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run examples tests on GPU
-        working-directory: /transformers
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_examples_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-
-  run_pipelines_torch_gpu:
-    name: PyTorch pipelines tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+  # run_tests_single_gpu:
+  #   name: Single GPU tests
+  #   strategy:
+  #     max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+  #     fail-fast: false
+  #     matrix:
+  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+  #       machine_type: [single-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Echo folder ${{ matrix.folders }}
+  #       shell: bash
+  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+  #       # set the artifact folder names (because the character `/` is not allowed).
+  #       run: |
+  #         echo "${{ matrix.folders }}"
+  #         matrix_folders=${{ matrix.folders }}
+  #         matrix_folders=${matrix_folders/'models/'/'models_'}
+  #         echo "$matrix_folders"
+  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all tests on GPU
+  #       working-directory: /transformers
+  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  # run_tests_multi_gpu:
+  #   name: Multi GPU tests
+  #   strategy:
+  #     max-parallel: 1
+  #     fail-fast: false
+  #     matrix:
+  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+  #       machine_type: [multi-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Echo folder ${{ matrix.folders }}
+  #       shell: bash
+  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+  #       # set the artifact folder names (because the character `/` is not allowed).
+  #       run: |
+  #         echo "${{ matrix.folders }}"
+  #         matrix_folders=${{ matrix.folders }}
+  #         matrix_folders=${matrix_folders/'models/'/'models_'}
+  #         echo "$matrix_folders"
+  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all tests on GPU
+  #       working-directory: /transformers
+  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  # run_examples_gpu:
+  #   name: Examples tests
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       machine_type: [single-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run examples tests on GPU
+  #       working-directory: /transformers
+  #       run: |
+  #         pip install -r examples/pytorch/_tests_requirements.txt
+  #         python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_examples_gpu
+  #         path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  # run_pipelines_torch_gpu:
+  #   name: PyTorch pipelines tests
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       machine_type: [single-gpu, multi-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all pipeline tests on GPU
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
   run_all_tests_torch_rocm_deepspeed_gpu:
     name: Torch ROCm deepspeed tests
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
new file mode 100644
index 00000000000000..a5752ebbbeef72
--- /dev/null
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -0,0 +1,19 @@
+FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# ARG PYTORCH='2.0.1'
+# ARG ROCM='5.7'
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+ARG REF=main
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# The `setup.py develop` line below must be added in order for Python to be aware of `transformers`.
+RUN cd transformers && python3 setup.py develop
+
+RUN python3 -c "from deepspeed.launcher.runner import main"
\ No newline at end of file

From 5a9a5296adc6d11aa4500be96f256b21cd0832a1 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 23 Nov 2023 19:18:13 +0100
Subject: [PATCH 04/33] add comment

---
 .github/workflows/self-scheduled-amd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 2cc44f553f4944..7fe82bda8c9926 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -366,7 +366,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
-      image: huggingface/transformers-pytorch-amd-gpu-test
+      image: huggingface/transformers-pytorch-amd-gpu-test  # TODO: replace with huggingface/transformers-pytorch-deepspeed-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone

From af46e872c66fdb3ac80ce3ca14017d88483e201f Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 23 Nov 2023 19:25:14 +0100
Subject: [PATCH 05/33] enable tests

---
 .github/workflows/self-scheduled-amd.yml | 496 +++++++++++------------
 1 file changed, 248 insertions(+), 248 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 7fe82bda8c9926..410afbd635fa05 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -107,254 +107,254 @@ jobs:
         run: |
           python3 utils/print_env.py
 
-  # run_tests_single_gpu:
-  #   name: Single GPU tests
-  #   strategy:
-  #     max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-  #     fail-fast: false
-  #     matrix:
-  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-  #       machine_type: [single-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Echo folder ${{ matrix.folders }}
-  #       shell: bash
-  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-  #       # set the artifact folder names (because the character `/` is not allowed).
-  #       run: |
-  #         echo "${{ matrix.folders }}"
-  #         matrix_folders=${{ matrix.folders }}
-  #         matrix_folders=${matrix_folders/'models/'/'models_'}
-  #         echo "$matrix_folders"
-  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all tests on GPU
-  #       working-directory: /transformers
-  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  # run_tests_multi_gpu:
-  #   name: Multi GPU tests
-  #   strategy:
-  #     max-parallel: 1
-  #     fail-fast: false
-  #     matrix:
-  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-  #       machine_type: [multi-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Echo folder ${{ matrix.folders }}
-  #       shell: bash
-  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-  #       # set the artifact folder names (because the character `/` is not allowed).
-  #       run: |
-  #         echo "${{ matrix.folders }}"
-  #         matrix_folders=${{ matrix.folders }}
-  #         matrix_folders=${matrix_folders/'models/'/'models_'}
-  #         echo "$matrix_folders"
-  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all tests on GPU
-  #       working-directory: /transformers
-  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  # run_examples_gpu:
-  #   name: Examples tests
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       machine_type: [single-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run examples tests on GPU
-  #       working-directory: /transformers
-  #       run: |
-  #         pip install -r examples/pytorch/_tests_requirements.txt
-  #         python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_examples_gpu
-  #         path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-
-  # run_pipelines_torch_gpu:
-  #   name: PyTorch pipelines tests
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       machine_type: [single-gpu, multi-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all pipeline tests on GPU
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+  run_tests_single_gpu:
+    name: Single GPU tests
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_multi_gpu:
+    name: Multi GPU tests
+    strategy:
+      max-parallel: 1
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_examples_gpu:
+    name: Examples tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run examples tests on GPU
+        working-directory: /transformers
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_examples_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  run_pipelines_torch_gpu:
+    name: PyTorch pipelines tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all pipeline tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
   run_all_tests_torch_rocm_deepspeed_gpu:
     name: Torch ROCm deepspeed tests

From c29d2492a5dafd60d5359a6f5b3138cfc3e41c6d Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 27 Nov 2023 18:54:31 +0100
Subject: [PATCH 06/33] trigger

---
 .github/workflows/build-docker-images.yml     | 271 +-----------------
 .../Dockerfile                                |   9 +-
 2 files changed, 10 insertions(+), 270 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 6c567566d4d64d..201ea127ba07f2 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      # - run_amd_scheduled_ci_caller_deepspeed_test
+      - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
@@ -19,262 +19,10 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
-  # latest-docker:
-  #   name: "Latest PyTorch + TensorFlow [dev]"
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     - name: Cleanup disk
-  #       run: |
-  #         sudo ls -l /usr/local/lib/
-  #         sudo ls -l /usr/share/
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #         sudo rm -rf /usr/local/lib/android
-  #         sudo rm -rf /usr/share/dotnet
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #     -
-  #       name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     -
-  #       name: Check out code
-  #       uses: actions/checkout@v3
-  #     -
-  #       name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     -
-  #       name: Build and push
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-all-latest-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
-  #     # Push CI images still need to be re-built daily
-  #     -
-  #       name: Build and push (for Push CI) in a daily basis
-  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-  #       if: inputs.image_postfix != '-push-ci'
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-all-latest-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-all-latest-gpu-push-ci
-
-  # latest-torch-deepspeed-docker:
-  #   name: "Latest PyTorch + DeepSpeed"
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     - name: Cleanup disk
-  #       run: |
-  #         sudo ls -l /usr/local/lib/
-  #         sudo ls -l /usr/share/
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #         sudo rm -rf /usr/local/lib/android
-  #         sudo rm -rf /usr/share/dotnet
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #     -
-  #       name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     -
-  #       name: Check out code
-  #       uses: actions/checkout@v3
-  #     -
-  #       name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     -
-  #       name: Build and push
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
-
-  # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
-  # latest-torch-deepspeed-docker-for-push-ci-daily-build:
-  #   name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     - name: Cleanup disk
-  #       run: |
-  #         sudo ls -l /usr/local/lib/
-  #         sudo ls -l /usr/share/
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #         sudo rm -rf /usr/local/lib/android
-  #         sudo rm -rf /usr/share/dotnet
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #     -
-  #       name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     -
-  #       name: Check out code
-  #       uses: actions/checkout@v3
-  #     -
-  #       name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     # Push CI images still need to be re-built daily
-  #     -
-  #       name: Build and push (for Push CI) in a daily basis
-  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-  #       if: inputs.image_postfix != '-push-ci'
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-
-  # doc-builder:
-  #   name: "Doc builder"
-  #   # Push CI doesn't need this image
-  #   if: inputs.image_postfix != '-push-ci'
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     -
-  #       name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     -
-  #       name: Check out code
-  #       uses: actions/checkout@v3
-  #     -
-  #       name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     -
-  #       name: Build and push
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-doc-builder
-  #         push: true
-  #         tags: huggingface/transformers-doc-builder
-
-  # latest-pytorch:
-  #   name: "Latest PyTorch [dev]"
-  #   # Push CI doesn't need this image
-  #   if: inputs.image_postfix != '-push-ci'
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     - name: Cleanup disk
-  #       run: |
-  #         sudo ls -l /usr/local/lib/
-  #         sudo ls -l /usr/share/
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #         sudo rm -rf /usr/local/lib/android
-  #         sudo rm -rf /usr/share/dotnet
-  #         sudo du -sh /usr/local/lib/
-  #         sudo du -sh /usr/share/
-  #     -
-  #       name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     -
-  #       name: Check out code
-  #       uses: actions/checkout@v3
-  #     -
-  #       name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     -
-  #       name: Build and push
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-pytorch-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-pytorch-gpu
-
-  # latest-pytorch-amd:
-  #   name: "Latest PyTorch (AMD) [dev]"
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
-  #   steps:
-  #     - name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     - name: Check out code
-  #       uses: actions/checkout@v3
-  #     - name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     - name: Build and push
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-pytorch-amd-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
-  #     # Push CI images still need to be re-built daily
-  #     - name: Build and push (for Push CI) in a daily basis
-  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-  #       if: inputs.image_postfix != '-push-ci'
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-pytorch-amd-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-pytorch-amd-gpu-push-ci
-
-  # latest-tensorflow:
-  #   name: "Latest TensorFlow [dev]"
-  #   # Push CI doesn't need this image
-  #   if: inputs.image_postfix != '-push-ci'
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     -
-  #       name: Set up Docker Buildx
-  #       uses: docker/setup-buildx-action@v3
-  #     -
-  #       name: Check out code
-  #       uses: actions/checkout@v3
-  #     -
-  #       name: Login to DockerHub
-  #       uses: docker/login-action@v3
-  #       with:
-  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
-  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     -
-  #       name: Build and push
-  #       uses: docker/build-push-action@v5
-  #       with:
-  #         context: ./docker/transformers-tensorflow-gpu
-  #         build-args: |
-  #           REF=main
-  #         push: true
-  #         tags: huggingface/transformers-tensorflow-gpu
-
   latest-pytorch-deepspeed-amd:
-    name: "PyTorch + DeepSpeed (AMD)"
+    name: "PyTorch + DeepSpeed (AMD) [dev]"
 
-    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+    runs-on: [self-hosted, yih-dar-shieh-debug-daily]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -293,16 +41,3 @@ jobs:
             REF=main
           push: true
           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci-test
\ No newline at end of file
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index a5752ebbbeef72..574951727c1835 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -2,13 +2,18 @@ FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
+ARG PYTORCH='2.0.1'
+ARG ROCM='5.7'
 
-# ARG PYTORCH='2.0.1'
-# ARG ROCM='5.7'
+RUN apt update && \
+    apt install -y --no-install-recommends libaio-dev && \
+    apt clean && \
+    rm -rf /var/lib/apt/lists/*
 
 RUN python3 -m pip install --no-cache-dir --upgrade pip
 
 ARG REF=main
+WORKDIR /
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
 

From a0c3dafbeee98952996e27ea53734ab736a319e2 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 27 Nov 2023 19:23:02 +0100
Subject: [PATCH 07/33] remove trigger for this branch

---
 .github/workflows/build-docker-images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 201ea127ba07f2..c5d5d0402d56a5 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      - run_amd_scheduled_ci_caller_deepspeed_test
+      # - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:

From 4cb9d6f54fdc225dd213e951026dd9ce1eb159fc Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 28 Nov 2023 12:04:16 +0100
Subject: [PATCH 08/33] trigger

---
 .github/workflows/build-docker-images.yml     |   2 +-
 .github/workflows/self-scheduled-amd.yml      | 528 +++++++++---------
 .../Dockerfile                                |   4 +-
 3 files changed, 267 insertions(+), 267 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index c5d5d0402d56a5..201ea127ba07f2 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      # - run_amd_scheduled_ci_caller_deepspeed_test
+      - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 410afbd635fa05..639a4df69c1de1 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -107,256 +107,256 @@ jobs:
         run: |
           python3 utils/print_env.py
 
-  run_tests_single_gpu:
-    name: Single GPU tests
-    strategy:
-      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_tests_multi_gpu:
-    name: Multi GPU tests
-    strategy:
-      max-parallel: 1
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_examples_gpu:
-    name: Examples tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run examples tests on GPU
-        working-directory: /transformers
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_examples_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-
-  run_pipelines_torch_gpu:
-    name: PyTorch pipelines tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
-
-  run_all_tests_torch_rocm_deepspeed_gpu:
+  # run_tests_single_gpu:
+  #   name: Single GPU tests
+  #   strategy:
+  #     max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+  #     fail-fast: false
+  #     matrix:
+  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+  #       machine_type: [single-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Echo folder ${{ matrix.folders }}
+  #       shell: bash
+  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+  #       # set the artifact folder names (because the character `/` is not allowed).
+  #       run: |
+  #         echo "${{ matrix.folders }}"
+  #         matrix_folders=${{ matrix.folders }}
+  #         matrix_folders=${matrix_folders/'models/'/'models_'}
+  #         echo "$matrix_folders"
+  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all tests on GPU
+  #       working-directory: /transformers
+  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  # run_tests_multi_gpu:
+  #   name: Multi GPU tests
+  #   strategy:
+  #     max-parallel: 1
+  #     fail-fast: false
+  #     matrix:
+  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+  #       machine_type: [multi-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Echo folder ${{ matrix.folders }}
+  #       shell: bash
+  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+  #       # set the artifact folder names (because the character `/` is not allowed).
+  #       run: |
+  #         echo "${{ matrix.folders }}"
+  #         matrix_folders=${{ matrix.folders }}
+  #         matrix_folders=${matrix_folders/'models/'/'models_'}
+  #         echo "$matrix_folders"
+  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all tests on GPU
+  #       working-directory: /transformers
+  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  # run_examples_gpu:
+  #   name: Examples tests
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       machine_type: [single-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run examples tests on GPU
+  #       working-directory: /transformers
+  #       run: |
+  #         pip install -r examples/pytorch/_tests_requirements.txt
+  #         python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_examples_gpu
+  #         path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  # run_pipelines_torch_gpu:
+  #   name: PyTorch pipelines tests
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       machine_type: [single-gpu, multi-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all pipeline tests on GPU
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+
+  run_tests_torch_deepspeed_gpu:
     name: Torch ROCm deepspeed tests
     strategy:
       fail-fast: false
@@ -398,19 +398,19 @@ jobs:
 
       - name: Run all tests on GPU
         working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu tests/deepspeed tests/extended
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended
 
       - name: Failure short reports
         if: ${{ failure() }}
         continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu/failures_short.txt
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt
 
       - name: Test suite reports artifacts
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_rocm_deepspeed_gpu_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu
+          name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu
 
   run_extract_warnings:
     name: Extract warnings in CI artifacts
@@ -420,11 +420,11 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
-      run_examples_gpu,
-      run_pipelines_torch_gpu,
-      run_all_tests_torch_rocm_deepspeed_gpu
+      # run_tests_single_gpu,
+      # run_tests_multi_gpu,
+      # run_examples_gpu,
+      # run_pipelines_torch_gpu,
+      run_tests_torch_deepspeed_gpu
     ]
     steps:
       - name: Checkout transformers
@@ -469,11 +469,11 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
-      run_examples_gpu,
-      run_pipelines_torch_gpu,
-      run_all_tests_torch_rocm_deepspeed_gpu,
+      # run_tests_single_gpu,
+      # run_tests_multi_gpu,
+      # run_examples_gpu,
+      # run_pipelines_torch_gpu,
+      run_tests_torch_deepspeed_gpu,
       run_extract_warnings
     ]
     steps:
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index 574951727c1835..cfbb5938fb37ae 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed
+FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
@@ -6,7 +6,7 @@ ARG PYTORCH='2.0.1'
 ARG ROCM='5.7'
 
 RUN apt update && \
-    apt install -y --no-install-recommends libaio-dev && \
+    apt install -y --no-install-recommends libaio-dev git && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 

From a7033499d83d3aa0a81eb695dcaf1d0f49d7eaab Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 28 Nov 2023 13:53:57 +0100
Subject: [PATCH 09/33] change runner env to trigger the docker build image
 test

---
 .github/workflows/build-docker-images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 201ea127ba07f2..f15e68218c6197 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -22,7 +22,7 @@ jobs:
   latest-pytorch-deepspeed-amd:
     name: "PyTorch + DeepSpeed (AMD) [dev]"
 
-    runs-on: [self-hosted, yih-dar-shieh-debug-daily]
+    runs-on: [self-hosted, yih-dar-shieh-debug-doctest]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3

From a47ac2ca1f98f4dc2b1eafeb31855c6b40e1fa56 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 28 Nov 2023 17:13:25 +0100
Subject: [PATCH 10/33] use new docker image

---
 .github/workflows/build-docker-images.yml | 2 +-
 .github/workflows/self-scheduled-amd.yml  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index f15e68218c6197..5bf28698f4ec1f 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      - run_amd_scheduled_ci_caller_deepspeed_test
+      # - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 639a4df69c1de1..348652560f3d9b 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -366,7 +366,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
-      image: huggingface/transformers-pytorch-amd-gpu-test # replace with huggingface/transformers-pytorch-deepspeed-amd-gpu/
+      image: huggingface/transformers-pytorch-deepspeed-amd-gpu-test # remove -test
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
@@ -375,7 +375,7 @@ jobs:
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[deepspeed]
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
 
       - name: ROCM-SMI
         run: |

From 233bd7f07a6e0fcf92a4558dc1dbc249170ec037 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 28 Nov 2023 23:45:52 +0100
Subject: [PATCH 11/33] remove test suffix from docker image tag

---
 .github/workflows/build-docker-images.yml       | 4 ++--
 .github/workflows/self-scheduled-amd-caller.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 5bf28698f4ec1f..84fdc8ab68f232 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      # - run_amd_scheduled_ci_caller_deepspeed_test
+      - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
@@ -40,4 +40,4 @@ jobs:
           build-args: |
             REF=main
           push: true
-          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index 4755bd868249ca..883477e458fd65 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -5,7 +5,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci_mi210:

From 971ba80a8cfc06110446fcad7ab5d5527c45602a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 29 Nov 2023 00:00:14 +0100
Subject: [PATCH 12/33] replace test docker image with original image

---
 .github/workflows/build-docker-images.yml     | 270 +++++++++++++++++-
 .../workflows/self-scheduled-amd-caller.yml   |   2 +-
 .github/workflows/self-scheduled-amd.yml      |   2 +-
 3 files changed, 270 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 84fdc8ab68f232..eb7b172888580b 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
@@ -19,10 +18,264 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
+  latest-docker:
+    name: "Latest PyTorch + TensorFlow [dev]"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-all-latest-gpu-push-ci
+
+  latest-torch-deepspeed-docker:
+    name: "Latest PyTorch + DeepSpeed"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
+
+  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
+  latest-torch-deepspeed-docker-for-push-ci-daily-build:
+    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+
+  doc-builder:
+    name: "Doc builder"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-22.04
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-doc-builder
+          push: true
+          tags: huggingface/transformers-doc-builder
+
+  latest-pytorch:
+    name: "Latest PyTorch [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-gpu
+
+# Need to be fixed with the help from Guillaume.
+#  latest-pytorch-amd:
+#    name: "Latest PyTorch (AMD) [dev]"
+#    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+#    steps:
+#      - name: Set up Docker Buildx
+#        uses: docker/setup-buildx-action@v3
+#      - name: Check out code
+#        uses: actions/checkout@v3
+#      - name: Login to DockerHub
+#        uses: docker/login-action@v3
+#        with:
+#          username: ${{ secrets.DOCKERHUB_USERNAME }}
+#          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+#      - name: Build and push
+#        uses: docker/build-push-action@v5
+#        with:
+#          context: ./docker/transformers-pytorch-amd-gpu
+#          build-args: |
+#            REF=main
+#          push: true
+#          tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
+#      # Push CI images still need to be re-built daily
+#      -
+#        name: Build and push (for Push CI) in a daily basis
+#        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+#        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+#        if: inputs.image_postfix != '-push-ci'
+#        uses: docker/build-push-action@v5
+#        with:
+#          context: ./docker/transformers-pytorch-amd-gpu
+#          build-args: |
+#            REF=main
+#          push: true
+#          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
+
+  latest-tensorflow:
+    name: "Latest TensorFlow [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-22.04
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-tensorflow-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-tensorflow-gpu
+
   latest-pytorch-deepspeed-amd:
     name: "PyTorch + DeepSpeed (AMD) [dev]"
 
-    runs-on: [self-hosted, yih-dar-shieh-debug-doctest]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -41,3 +294,16 @@ jobs:
             REF=main
           push: true
           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index 883477e458fd65..4755bd868249ca 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -5,7 +5,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_amd_scheduled_ci_caller__*
+      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_amd_ci_mi210:
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 348652560f3d9b..1cc6af0eed8ff2 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -366,7 +366,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
-      image: huggingface/transformers-pytorch-deepspeed-amd-gpu-test # remove -test
+      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone

From da4774c0481147ebdd6f59109293010fad8cd2bd Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 29 Nov 2023 19:07:15 +0100
Subject: [PATCH 13/33] push new image

---
 .github/workflows/build-docker-images.yml     | 233 +-----------------
 .../workflows/self-scheduled-amd-caller.yml   |   2 +-
 .../Dockerfile                                |   2 +
 3 files changed, 6 insertions(+), 231 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index eb7b172888580b..3dc558cc3e473a 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - build_ci_docker_image*
+      - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
@@ -18,195 +19,6 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
-  latest-docker:
-    name: "Latest PyTorch + TensorFlow [dev]"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-all-latest-gpu-push-ci
-
-  latest-torch-deepspeed-docker:
-    name: "Latest PyTorch + DeepSpeed"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
-
-  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
-  latest-torch-deepspeed-docker-for-push-ci-daily-build:
-    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-
-  doc-builder:
-    name: "Doc builder"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-doc-builder
-          push: true
-          tags: huggingface/transformers-doc-builder
-
-  latest-pytorch:
-    name: "Latest PyTorch [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-gpu
 
 # Need to be fixed with the help from Guillaume.
 #  latest-pytorch-amd:
@@ -244,38 +56,12 @@ jobs:
 #          push: true
 #          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
 
-  latest-tensorflow:
-    name: "Latest TensorFlow [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-22.04
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-tensorflow-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-tensorflow-gpu
+
 
   latest-pytorch-deepspeed-amd:
     name: "PyTorch + DeepSpeed (AMD) [dev]"
 
-    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+    runs-on: [self-hosted, yih-dar-shieh-debug-doctest]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -294,16 +80,3 @@ jobs:
             REF=main
           push: true
           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index 4755bd868249ca..883477e458fd65 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -5,7 +5,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci_mi210:
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index cfbb5938fb37ae..5090b14f9c283d 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -12,6 +12,8 @@ RUN apt update && \
 
 RUN python3 -m pip install --no-cache-dir --upgrade pip
 
+RUN python3 -m pip uninstall -y apex
+
 ARG REF=main
 WORKDIR /
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

From cbe995ff2041aa5f481d2543a388864192ea9dba Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 11:19:46 +0100
Subject: [PATCH 14/33] Trigger


From e16c271403b94455d418e106cc22e00df778349c Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 11:21:38 +0100
Subject: [PATCH 15/33] add back amd tests

---
 .github/workflows/build-docker-images.yml     | 233 +++++++++++++++++-
 .../workflows/self-scheduled-amd-caller.yml   |   2 +-
 2 files changed, 231 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 3dc558cc3e473a..eb7b172888580b 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - build_ci_docker_image*
-      - run_amd_scheduled_ci_caller_deepspeed_test
   repository_dispatch:
   workflow_call:
     inputs:
@@ -19,6 +18,195 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
+  latest-docker:
+    name: "Latest PyTorch + TensorFlow [dev]"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-all-latest-gpu-push-ci
+
+  latest-torch-deepspeed-docker:
+    name: "Latest PyTorch + DeepSpeed"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
+
+  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
+  latest-torch-deepspeed-docker-for-push-ci-daily-build:
+    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+
+  doc-builder:
+    name: "Doc builder"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-22.04
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-doc-builder
+          push: true
+          tags: huggingface/transformers-doc-builder
+
+  latest-pytorch:
+    name: "Latest PyTorch [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-gpu
 
 # Need to be fixed with the help from Guillaume.
 #  latest-pytorch-amd:
@@ -56,12 +244,38 @@ jobs:
 #          push: true
 #          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
 
-
+  latest-tensorflow:
+    name: "Latest TensorFlow [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-22.04
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-tensorflow-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-tensorflow-gpu
 
   latest-pytorch-deepspeed-amd:
     name: "PyTorch + DeepSpeed (AMD) [dev]"
 
-    runs-on: [self-hosted, yih-dar-shieh-debug-doctest]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -80,3 +294,16 @@ jobs:
             REF=main
           push: true
           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index 883477e458fd65..4755bd868249ca 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -5,7 +5,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_amd_scheduled_ci_caller__*
+      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_amd_ci_mi210:

From 70c3580febcf837848ada43832b6758b2744ad1f Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 11:25:49 +0100
Subject: [PATCH 16/33] fix typo

---
 docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
index 276f35f3351846..184639552eee7a 100644
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt
 
 # recompile apex
 RUN python3 -m pip uninstall -y apex
-RUN git clone https://github.com/NVIDIA/apex
+# RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
 # TODO: check if there is alternative way to install latest apex
 # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .

From 090b88e20466623a9aa6709487c5817dd6f7d774 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 11:26:04 +0100
Subject: [PATCH 17/33] add amd tests back

---
 .github/workflows/self-scheduled-amd.yml | 512 +++++++++++------------
 1 file changed, 256 insertions(+), 256 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 1cc6af0eed8ff2..54ab319897ef2e 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -107,254 +107,254 @@ jobs:
         run: |
           python3 utils/print_env.py
 
-  # run_tests_single_gpu:
-  #   name: Single GPU tests
-  #   strategy:
-  #     max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-  #     fail-fast: false
-  #     matrix:
-  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-  #       machine_type: [single-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Echo folder ${{ matrix.folders }}
-  #       shell: bash
-  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-  #       # set the artifact folder names (because the character `/` is not allowed).
-  #       run: |
-  #         echo "${{ matrix.folders }}"
-  #         matrix_folders=${{ matrix.folders }}
-  #         matrix_folders=${matrix_folders/'models/'/'models_'}
-  #         echo "$matrix_folders"
-  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all tests on GPU
-  #       working-directory: /transformers
-  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  # run_tests_multi_gpu:
-  #   name: Multi GPU tests
-  #   strategy:
-  #     max-parallel: 1
-  #     fail-fast: false
-  #     matrix:
-  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-  #       machine_type: [multi-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Echo folder ${{ matrix.folders }}
-  #       shell: bash
-  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-  #       # set the artifact folder names (because the character `/` is not allowed).
-  #       run: |
-  #         echo "${{ matrix.folders }}"
-  #         matrix_folders=${{ matrix.folders }}
-  #         matrix_folders=${matrix_folders/'models/'/'models_'}
-  #         echo "$matrix_folders"
-  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all tests on GPU
-  #       working-directory: /transformers
-  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  # run_examples_gpu:
-  #   name: Examples tests
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       machine_type: [single-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run examples tests on GPU
-  #       working-directory: /transformers
-  #       run: |
-  #         pip install -r examples/pytorch/_tests_requirements.txt
-  #         python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_examples_gpu
-  #         path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-
-  # run_pipelines_torch_gpu:
-  #   name: PyTorch pipelines tests
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       machine_type: [single-gpu, multi-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all pipeline tests on GPU
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+  run_tests_single_gpu:
+    name: Single GPU tests
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_multi_gpu:
+    name: Multi GPU tests
+    strategy:
+      max-parallel: 1
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_examples_gpu:
+    name: Examples tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run examples tests on GPU
+        working-directory: /transformers
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_examples_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  run_pipelines_torch_gpu:
+    name: PyTorch pipelines tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all pipeline tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
   run_tests_torch_deepspeed_gpu:
     name: Torch ROCm deepspeed tests
@@ -420,10 +420,10 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      # run_tests_single_gpu,
-      # run_tests_multi_gpu,
-      # run_examples_gpu,
-      # run_pipelines_torch_gpu,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_torch_gpu,
       run_tests_torch_deepspeed_gpu
     ]
     steps:
@@ -469,10 +469,10 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      # run_tests_single_gpu,
-      # run_tests_multi_gpu,
-      # run_examples_gpu,
-      # run_pipelines_torch_gpu,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_torch_gpu,
       run_tests_torch_deepspeed_gpu,
       run_extract_warnings
     ]

From 508ae294fdc9012ae7f87f9895c1a80c3ce876c5 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 15:37:35 +0100
Subject: [PATCH 18/33] fix

---
 .github/workflows/self-scheduled-amd-caller.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index 4755bd868249ca..883477e458fd65 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -5,7 +5,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci_mi210:

From 09fee9eac76ba46427662d430b23c7c2cb344c4e Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 15:57:12 +0100
Subject: [PATCH 19/33] comment until docker image build scheduled test fix

---
 .github/workflows/build-docker-images.yml | 68 +++++++++++------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index eb7b172888580b..be070a95d3a94f 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -272,38 +272,38 @@ jobs:
           push: true
           tags: huggingface/transformers-tensorflow-gpu
 
-  latest-pytorch-deepspeed-amd:
-    name: "PyTorch + DeepSpeed (AMD) [dev]"
+  # latest-pytorch-deepspeed-amd:
+  #   name: "PyTorch + DeepSpeed (AMD) [dev]"
 
-    runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
-    steps:
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Check out code
-        uses: actions/checkout@v3
-      - name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
-      # Push CI images still need to be re-built daily
-      -
-        name: Build and push (for Push CI) in a daily basis
-        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
-        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
-        if: inputs.image_postfix != '-push-ci'
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+  #   steps:
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - name: Check out code
+  #       uses: actions/checkout@v3
+  #     - name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     - name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+  #     # Push CI images still need to be re-built daily
+  #     -
+  #       name: Build and push (for Push CI) in a daily basis
+  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+  #       if: inputs.image_postfix != '-push-ci'
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci

From 407cfe972256448fae967fa4504483319a9a7696 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 16:21:43 +0100
Subject: [PATCH 20/33] remove deprecated deepspeed build option

---
 .github/workflows/self-nightly-scheduled.yml                | 4 ++--
 .github/workflows/self-past.yml                             | 4 ++--
 .github/workflows/self-push.yml                             | 4 ++--
 .github/workflows/self-scheduled.yml                        | 2 +-
 docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
index e4b4f7f77cf077..37dc98f340a16d 100644
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -212,7 +212,7 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -286,4 +286,4 @@ jobs:
         with:
           name: |
               single-*
-              multi-*
\ No newline at end of file
+              multi-*
diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
index 6a154544df8b97..d0ce313bd48e0a 100644
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -255,7 +255,7 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -341,4 +341,4 @@ jobs:
         with:
           name: |
               single-*
-              multi-*
\ No newline at end of file
+              multi-*
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index a6ea5b1e04b942..e6f1f3b3050f7a 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -366,7 +366,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -456,7 +456,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 4a04cb14ac7bb3..bf7f579a9e3ad2 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -366,7 +366,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
index 184639552eee7a..a8789f9170e721 100644
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.

From f846b80bed3b75471edfde5ae3b617d0b235765a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 30 Nov 2023 18:20:53 +0100
Subject: [PATCH 21/33] upgrade torch

---
 .../Dockerfile                                       | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index 5090b14f9c283d..817abd42df7baa 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -1,9 +1,11 @@
-FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
+FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
-ARG PYTORCH='2.0.1'
-ARG ROCM='5.7'
+ARG PYTORCH='2.1.0'
+ARG TORCH_VISION='0.16.0'
+ARG TORCH_AUDIO='2.1.0'
+ARG ROCM='5.6'
 
 RUN apt update && \
     apt install -y --no-install-recommends libaio-dev git && \
@@ -14,6 +16,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 
 RUN python3 -m pip uninstall -y apex
 
+RUN python3 -m pip uninstall -y torch torchvision torchaudio
+
+RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM
+
 ARG REF=main
 WORKDIR /
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

From 785b63aed48b6b15b538aa49c71327cd7fb98d50 Mon Sep 17 00:00:00 2001
From: Felix Marty <felix@hf.co>
Date: Mon, 4 Dec 2023 15:30:47 +0000
Subject: [PATCH 22/33] update docker & make tests pass

---
 .../transformers-pytorch-amd-gpu/Dockerfile   |  4 +++
 .../Dockerfile                                | 32 ++++++++++++-------
 tests/deepspeed/test_deepspeed.py             |  4 +--
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile
index 216ff4c4385548..46ca1a531b4ab4 100644
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@@ -22,7 +22,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+htt
 
 ARG REF=main
 WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
 
 RUN python3 -m pip uninstall -y tensorflow flax
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index 817abd42df7baa..e426b3ee75809d 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -1,32 +1,40 @@
-FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1
+FROM rocm/dev-ubuntu-22.04:5.6
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
-ARG PYTORCH='2.1.0'
-ARG TORCH_VISION='0.16.0'
-ARG TORCH_AUDIO='2.1.0'
+ARG PYTORCH='2.1.1'
+ARG TORCH_VISION='0.16.1'
+ARG TORCH_AUDIO='2.1.1'
 ARG ROCM='5.6'
 
 RUN apt update && \
-    apt install -y --no-install-recommends libaio-dev git && \
+    apt install -y --no-install-recommends \
+    libaio-dev \
+    git \
+    # These are required to build deepspeed.
+    python3-dev \
+    rocrand-dev \
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblas-dev && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 
 RUN python3 -m pip install --no-cache-dir --upgrade pip
-
-RUN python3 -m pip uninstall -y apex
-
-RUN python3 -m pip uninstall -y torch torchvision torchaudio
-
-RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM
+RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
+RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
 
 ARG REF=main
 WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
 
-RUN python3 -c "from deepspeed.launcher.runner import main"
\ No newline at end of file
+RUN python3 -c "from deepspeed.launcher.runner import main"
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 2352cf522f29a7..14c8f6703166c9 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -561,8 +561,8 @@ def test_gradient_accumulation(self, stage, dtype):
         self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
         self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
 
-        # see the note above how to get identical loss on a small bs
-        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
+        # Absolute relative difference. See the note above how to get identical loss on a small bs
+        self.assertTrue(abs(no_grad_accum_loss - yes_grad_accum_loss) / (abs(no_grad_accum_loss) + 1e-15) <= 1e-3)
 
     def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
         # adapted from TrainerIntegrationCommon.check_saved_checkpoints

From f0f931e2fcf5bdfc1c510a53d9787f339785e683 Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Tue, 5 Dec 2023 19:34:58 +0900
Subject: [PATCH 23/33] Update
 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile

---
 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index e426b3ee75809d..36a2d8fafd8eb4 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -16,7 +16,8 @@ RUN apt update && \
     rocrand-dev \
     rocthrust-dev \
     hipsparse-dev \
-    hipblas-dev && \
+    hipblas-dev \
+    rocblas-dev && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 

From 40398b9a0f4bb08ea2829e1fac8bb44cb5b9812b Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 5 Dec 2023 11:47:45 +0100
Subject: [PATCH 24/33] fix

---
 .github/workflows/self-scheduled-amd-caller.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index 14d46f453c31d6..fb02d2742d16dc 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -3,9 +3,6 @@ name: Self-hosted runner (AMD scheduled CI caller)
 on:
   schedule:
     - cron: "17 2 * * *"
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_scheduled_amd_ci:
@@ -14,4 +11,4 @@ jobs:
     if: ${{ always() }}
     steps:
       - name: Trigger scheduled AMD CI via workflow_run
-        run: echo "Trigger scheduled AMD CI via workflow_run"
+        run: echo "Trigger scheduled AMD CI via workflow_run"
\ No newline at end of file

From 3332cd2eb55005b597708805699285948d0a2f3e Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 5 Dec 2023 11:47:57 +0100
Subject: [PATCH 25/33] tmp disable test

---
 .github/workflows/self-scheduled-amd-mi210-caller.yml | 2 +-
 .github/workflows/self-scheduled-amd-mi250-caller.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml
index ceaba454ae642e..68294abfa2a730 100644
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml
index 843e3476342e9d..748c6ec0d18711 100644
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci:

From 9696cc4ef7b8859b830b9e4cb1be1cf881945bcd Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 5 Dec 2023 14:28:28 +0100
Subject: [PATCH 26/33] precompile deepspeed to avoid timeout during tests

---
 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index 36a2d8fafd8eb4..466af2fcbf889f 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -21,10 +21,13 @@ RUN apt update && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN python3 -m pip install --no-cache-dir --upgrade pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
 RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
 RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
 
+# Pre-build **latest** DeepSpeed, so it's be ready for testing (to avoid timeout)
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
+
 ARG REF=main
 WORKDIR /
 
@@ -32,10 +35,10 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 
-RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece]
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
 
-RUN python3 -c "from deepspeed.launcher.runner import main"
+RUN python3 -c "from deepspeed.launcher.runner import main"
\ No newline at end of file

From 84a7a3398de8c83c66f7d5ca2600d65407dd5715 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 5 Dec 2023 16:13:11 +0100
Subject: [PATCH 27/33] fix comment

---
 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index 466af2fcbf889f..e309d555306bdb 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -25,7 +25,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
 RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
 RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
 
-# Pre-build **latest** DeepSpeed, so it's be ready for testing (to avoid timeout)
+# Pre-build DeepSpeed, so it will be ready for testing (to avoid timeout)
 RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
 
 ARG REF=main

From df00cff6caa4722a26f1de9c3219ce8e28c7c5c0 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 5 Dec 2023 17:56:47 +0100
Subject: [PATCH 28/33] trigger deepspeed tests with new image

---
 .../self-scheduled-amd-mi210-caller.yml       |   2 +-
 .../self-scheduled-amd-mi250-caller.yml       |   2 +-
 .github/workflows/self-scheduled-amd.yml      | 498 +++++++++---------
 3 files changed, 251 insertions(+), 251 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml
index 68294abfa2a730..ceaba454ae642e 100644
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller__*
+      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml
index 748c6ec0d18711..843e3476342e9d 100644
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller__*
+      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 997b5ed4ee16eb..70f844da792039 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -107,254 +107,254 @@ jobs:
         run: |
           python3 utils/print_env.py
 
-  run_tests_single_gpu:
-    name: Single GPU tests
-    strategy:
-      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_tests_multi_gpu:
-    name: Multi GPU tests
-    strategy:
-      max-parallel: 1
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_examples_gpu:
-    name: Examples tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run examples tests on GPU
-        working-directory: /transformers
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_examples_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-
-  run_pipelines_torch_gpu:
-    name: PyTorch pipelines tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+  # run_tests_single_gpu:
+  #   name: Single GPU tests
+  #   strategy:
+  #     max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+  #     fail-fast: false
+  #     matrix:
+  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+  #       machine_type: [single-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Echo folder ${{ matrix.folders }}
+  #       shell: bash
+  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+  #       # set the artifact folder names (because the character `/` is not allowed).
+  #       run: |
+  #         echo "${{ matrix.folders }}"
+  #         matrix_folders=${{ matrix.folders }}
+  #         matrix_folders=${matrix_folders/'models/'/'models_'}
+  #         echo "$matrix_folders"
+  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all tests on GPU
+  #       working-directory: /transformers
+  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  # run_tests_multi_gpu:
+  #   name: Multi GPU tests
+  #   strategy:
+  #     max-parallel: 1
+  #     fail-fast: false
+  #     matrix:
+  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+  #       machine_type: [multi-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Echo folder ${{ matrix.folders }}
+  #       shell: bash
+  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+  #       # set the artifact folder names (because the character `/` is not allowed).
+  #       run: |
+  #         echo "${{ matrix.folders }}"
+  #         matrix_folders=${{ matrix.folders }}
+  #         matrix_folders=${matrix_folders/'models/'/'models_'}
+  #         echo "$matrix_folders"
+  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all tests on GPU
+  #       working-directory: /transformers
+  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  # run_examples_gpu:
+  #   name: Examples tests
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       machine_type: [single-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run examples tests on GPU
+  #       working-directory: /transformers
+  #       run: |
+  #         pip install -r examples/pytorch/_tests_requirements.txt
+  #         python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_examples_gpu
+  #         path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  # run_pipelines_torch_gpu:
+  #   name: PyTorch pipelines tests
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       machine_type: [single-gpu, multi-gpu]
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+  #   container:
+  #     image: huggingface/transformers-pytorch-amd-gpu
+  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+  #   needs: setup
+  #   steps:
+  #     - name: Update clone
+  #       working-directory: /transformers
+  #       run: git fetch && git checkout ${{ github.sha }}
+
+  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+  #       working-directory: /transformers
+  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+  #     - name: ROCM-SMI
+  #       run: |
+  #         rocm-smi
+  #     - name: ROCM-INFO
+  #       run: |
+  #         rocminfo  | grep "Agent" -A 14
+  #     - name: Show ROCR environment
+  #       run: |
+  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  #     - name: Environment
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 utils/print_env.py
+
+  #     - name: Show installed libraries and their versions
+  #       working-directory: /transformers
+  #       run: pip freeze
+
+  #     - name: Run all pipeline tests on GPU
+  #       working-directory: /transformers
+  #       run: |
+  #         python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+  #     - name: Failure short reports
+  #       if: ${{ failure() }}
+  #       continue-on-error: true
+  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+  #     - name: Test suite reports artifacts
+  #       if: ${{ always() }}
+  #       uses: actions/upload-artifact@v3
+  #       with:
+  #         name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
   run_tests_torch_deepspeed_gpu:
     name: Torch ROCm deepspeed tests
@@ -366,7 +366,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
-      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      image: echarlaix/amd-deepspeed-test # TODO: replace with huggingface/transformers-pytorch-deepspeed-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone

From 92c402d9e48ae6f4b68caab8115259b42ad70cd5 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 5 Dec 2023 18:04:02 +0100
Subject: [PATCH 29/33] comment tests

---
 .github/workflows/self-scheduled-amd-caller.yml |  2 +-
 .github/workflows/self-scheduled-amd.yml        | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
index fb02d2742d16dc..dc5c7b7e905bd8 100644
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -11,4 +11,4 @@ jobs:
     if: ${{ always() }}
     steps:
       - name: Trigger scheduled AMD CI via workflow_run
-        run: echo "Trigger scheduled AMD CI via workflow_run"
\ No newline at end of file
+        run: echo "Trigger scheduled AMD CI via workflow_run"
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 70f844da792039..1ac5ef980c95e2 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -420,10 +420,10 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
-      run_examples_gpu,
-      run_pipelines_torch_gpu,
+      # run_tests_single_gpu,
+      # run_tests_multi_gpu,
+      # run_examples_gpu,
+      # run_pipelines_torch_gpu,
       run_tests_torch_deepspeed_gpu
     ]
     steps:
@@ -469,10 +469,10 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
-      run_examples_gpu,
-      run_pipelines_torch_gpu,
+      # run_tests_single_gpu,
+      # run_tests_multi_gpu,
+      # run_examples_gpu,
+      # run_pipelines_torch_gpu,
       run_tests_torch_deepspeed_gpu,
       run_extract_warnings
     ]

From fa82a9c747bbc2acc1b2f00fc6092159d8fdfbb2 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 6 Dec 2023 10:42:57 +0100
Subject: [PATCH 30/33] trigger


From ecb92392c6b7f10acb27d7691972c7191fc3962b Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 6 Dec 2023 12:53:01 +0100
Subject: [PATCH 31/33] add sklearn dependency to fix slow tests

---
 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index e309d555306bdb..1fa384dfa2bc03 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -13,6 +13,7 @@ RUN apt update && \
     git \
     # These are required to build deepspeed.
     python3-dev \
+    python-is-python3 \
     rocrand-dev \
     rocthrust-dev \
     hipsparse-dev \
@@ -35,7 +36,7 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 
-RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece]
+RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.

From cfcc312b4e83c7ab665032d40cfca7ed9c02e443 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 6 Dec 2023 15:59:49 +0100
Subject: [PATCH 32/33] enable back other tests

---
 .../self-scheduled-amd-mi210-caller.yml       |   2 +-
 .../self-scheduled-amd-mi250-caller.yml       |   2 +-
 .github/workflows/self-scheduled-amd.yml      | 512 +++++++++---------
 3 files changed, 258 insertions(+), 258 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml
index ceaba454ae642e..68294abfa2a730 100644
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml
index 843e3476342e9d..748c6ec0d18711 100644
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller__*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 1ac5ef980c95e2..43d59b90e3b152 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -107,254 +107,254 @@ jobs:
         run: |
           python3 utils/print_env.py
 
-  # run_tests_single_gpu:
-  #   name: Single GPU tests
-  #   strategy:
-  #     max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-  #     fail-fast: false
-  #     matrix:
-  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-  #       machine_type: [single-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Echo folder ${{ matrix.folders }}
-  #       shell: bash
-  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-  #       # set the artifact folder names (because the character `/` is not allowed).
-  #       run: |
-  #         echo "${{ matrix.folders }}"
-  #         matrix_folders=${{ matrix.folders }}
-  #         matrix_folders=${matrix_folders/'models/'/'models_'}
-  #         echo "$matrix_folders"
-  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all tests on GPU
-  #       working-directory: /transformers
-  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  # run_tests_multi_gpu:
-  #   name: Multi GPU tests
-  #   strategy:
-  #     max-parallel: 1
-  #     fail-fast: false
-  #     matrix:
-  #       folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-  #       machine_type: [multi-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Echo folder ${{ matrix.folders }}
-  #       shell: bash
-  #       # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-  #       # set the artifact folder names (because the character `/` is not allowed).
-  #       run: |
-  #         echo "${{ matrix.folders }}"
-  #         matrix_folders=${{ matrix.folders }}
-  #         matrix_folders=${matrix_folders/'models/'/'models_'}
-  #         echo "$matrix_folders"
-  #         echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all tests on GPU
-  #       working-directory: /transformers
-  #       run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  # run_examples_gpu:
-  #   name: Examples tests
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       machine_type: [single-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run examples tests on GPU
-  #       working-directory: /transformers
-  #       run: |
-  #         pip install -r examples/pytorch/_tests_requirements.txt
-  #         python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_examples_gpu
-  #         path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
-
-  # run_pipelines_torch_gpu:
-  #   name: PyTorch pipelines tests
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       machine_type: [single-gpu, multi-gpu]
-  #   runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-  #   container:
-  #     image: huggingface/transformers-pytorch-amd-gpu
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-  #   needs: setup
-  #   steps:
-  #     - name: Update clone
-  #       working-directory: /transformers
-  #       run: git fetch && git checkout ${{ github.sha }}
-
-  #     - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-  #       working-directory: /transformers
-  #       run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo  | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-  #     - name: Environment
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 utils/print_env.py
-
-  #     - name: Show installed libraries and their versions
-  #       working-directory: /transformers
-  #       run: pip freeze
-
-  #     - name: Run all pipeline tests on GPU
-  #       working-directory: /transformers
-  #       run: |
-  #         python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
-
-  #     - name: Failure short reports
-  #       if: ${{ failure() }}
-  #       continue-on-error: true
-  #       run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
-
-  #     - name: Test suite reports artifacts
-  #       if: ${{ always() }}
-  #       uses: actions/upload-artifact@v3
-  #       with:
-  #         name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
-  #         path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+  run_tests_single_gpu:
+    name: Single GPU tests
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_multi_gpu:  # One job per test folder: runs pytest on tests/<folder> on the multi-gpu AMD runners.
+    name: Multi GPU tests
+    strategy:
+      max-parallel: 1  # Matrix entries run one at a time.
+      fail-fast: false  # A failing matrix entry does not cancel the remaining ones.
+      matrix:  # `folders` is parsed from the JSON `matrix` output of the `setup` job.
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:  # Exposes /dev/kfd and /dev/dri and forwards ROCR_VISIBLE_DEVICES into the container.
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup  # Needed for `needs.setup.outputs.matrix` used in the matrix above.
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone  # Fetches and checks out the triggering commit (github.sha) in /transformers.
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI  # Logs the output of `rocm-smi`.
+        run: |
+          rocm-smi
+      - name: ROCM-INFO  # Logs each "Agent" match from `rocminfo` plus the 14 lines after it.
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment  # Logs the ROCR_VISIBLE_DEVICES value seen inside the container.
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment  # Runs the repository's utils/print_env.py.
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU  # Report id passed to --make-reports: <machine_type>_tests_gpu_<folder>.
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports  # Runs only after a failure; prints failures_short.txt from the report folder.
+        if: ${{ failure() }}
+        continue-on-error: true  # A failure of this step itself (e.g. `cat` on a missing file) does not fail the job.
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts  # Uploaded regardless of earlier step results (if: always()).
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:  # Artifact name uses env.matrix_folders (no `/`); the path keeps the raw matrix.folders value.
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_examples_gpu:  # Runs the examples/pytorch test suite on a single-gpu AMD runner.
+    name: Examples tests
+    strategy:
+      fail-fast: false  # A failing matrix entry does not cancel the remaining ones.
+      matrix:
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:  # Exposes /dev/kfd and /dev/dri and forwards ROCR_VISIBLE_DEVICES into the container.
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone  # Fetches and checks out the triggering commit (github.sha) in /transformers.
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI  # Logs the output of `rocm-smi`.
+        run: |
+          rocm-smi
+      - name: ROCM-INFO  # Logs each "Agent" match from `rocminfo` plus the 14 lines after it.
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment  # Logs the ROCR_VISIBLE_DEVICES value seen inside the container.
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment  # Runs the repository's utils/print_env.py.
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run examples tests on GPU  # Installs the examples' test requirements, then runs pytest on examples/pytorch.
+        working-directory: /transformers
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+      - name: Failure short reports  # Runs only after a failure; prints failures_short.txt from the report folder.
+        if: ${{ failure() }}
+        continue-on-error: true  # A failure of this step itself (e.g. `cat` on a missing file) does not fail the job.
+        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts  # Uploaded regardless of earlier step results (if: always()).
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:  # Uploads the <machine_type>_examples_gpu report folder written by the pytest step.
+          name: ${{ matrix.machine_type }}_run_examples_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  run_pipelines_torch_gpu:  # Runs tests/pipelines once per machine type (single-gpu and multi-gpu).
+    name: PyTorch pipelines tests
+    strategy:
+      fail-fast: false  # A failure on one machine type does not cancel the other.
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:  # Exposes /dev/kfd and /dev/dri and forwards ROCR_VISIBLE_DEVICES into the container.
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone  # Fetches and checks out the triggering commit (github.sha) in /transformers.
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI  # Logs the output of `rocm-smi`.
+        run: |
+          rocm-smi
+      - name: ROCM-INFO  # Logs each "Agent" match from `rocminfo` plus the 14 lines after it.
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment  # Logs the ROCR_VISIBLE_DEVICES value seen inside the container.
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment  # Runs the repository's utils/print_env.py.
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all pipeline tests on GPU  # pytest on tests/pipelines with `-n 1 --dist=loadfile`.
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+      - name: Failure short reports  # Runs only after a failure; prints failures_short.txt from the report folder.
+        if: ${{ failure() }}
+        continue-on-error: true  # A failure of this step itself (e.g. `cat` on a missing file) does not fail the job.
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts  # Uploaded regardless of earlier step results (if: always()).
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:  # Uploads the <machine_type>_tests_torch_pipeline_gpu report folder written by the pytest step.
+          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
 
   run_tests_torch_deepspeed_gpu:
     name: Torch ROCm deepspeed tests
@@ -420,10 +420,10 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      # run_tests_single_gpu,
-      # run_tests_multi_gpu,
-      # run_examples_gpu,
-      # run_pipelines_torch_gpu,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_torch_gpu,
       run_tests_torch_deepspeed_gpu
     ]
     steps:
@@ -469,10 +469,10 @@ jobs:
       check_runner_status,
       check_runners,
       setup,
-      # run_tests_single_gpu,
-      # run_tests_multi_gpu,
-      # run_examples_gpu,
-      # run_pipelines_torch_gpu,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_torch_gpu,
       run_tests_torch_deepspeed_gpu,
       run_extract_warnings
     ]

From ae82b3fd526fe10a9cab96df164e9fd8cc9d0b15 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 7 Dec 2023 15:43:15 +0100
Subject: [PATCH 33/33] final update

---
 .github/workflows/self-scheduled-amd-mi210-caller.yml | 2 +-
 .github/workflows/self-scheduled-amd-mi250-caller.yml | 2 +-
 .github/workflows/self-scheduled-amd.yml              | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml
index 68294abfa2a730..ceaba454ae642e 100644
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller__*
+      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml
index 748c6ec0d18711..843e3476342e9d 100644
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@@ -7,7 +7,7 @@ on:
     types: [completed]
   push:
     branches:
-      - run_amd_scheduled_ci_caller__*
+      - run_amd_scheduled_ci_caller*
 
 jobs:
   run_amd_ci:
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
index 43d59b90e3b152..3d41a3b95e6c50 100644
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -366,7 +366,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
-      image: echarlaix/amd-deepspeed-test # TODO: replace with huggingface/transformers-pytorch-deepspeed-amd-gpu
+      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
@@ -383,6 +383,7 @@ jobs:
       - name: ROCM-INFO
         run: |
           rocminfo  | grep "Agent" -A 14
+
       - name: Show ROCR environment
         run: |
           echo "ROCR: $ROCR_VISIBLE_DEVICES"