From 1e8ce6607b49ca32c009e4e86467df8ae11a84b6 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 21 Nov 2023 15:37:00 +0100 Subject: [PATCH 01/33] add deepspeed scheduled test for amd --- .github/workflows/self-scheduled-amd.yml | 60 +++++++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 17e907e40a5757..774d814883e057 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -356,6 +356,62 @@ jobs: name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + run_all_tests_torch_rocm_deepspeed_gpu: + name: Torch ROCm deepspeed tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + needs: setup + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[deepspeed] + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_rocm_deepspeed_gpu_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu + run_extract_warnings: name: Extract warnings in CI artifacts runs-on: ubuntu-22.04 @@ -368,7 +424,7 @@ jobs: run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu, - # run_all_tests_torch_cuda_extensions_gpu + run_all_tests_torch_rocm_deepspeed_gpu ] steps: - name: Checkout transformers @@ -417,7 +473,7 @@ jobs: run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu, - # run_all_tests_torch_cuda_extensions_gpu, + run_all_tests_torch_rocm_deepspeed_gpu, run_extract_warnings ] steps: From bf276ed0363fe817d9d11dfb9058b4fc094caa07 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 22 Nov 2023 00:32:51 +0100 Subject: [PATCH 02/33] fix image --- .github/workflows/self-scheduled-amd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 774d814883e057..0bf9c092c87fd5 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -366,7 +366,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: - image: huggingface/transformers-pytorch-amd-gpu + image: huggingface/transformers-pytorch-amd-gpu-test options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone From 2cfb53d1d5486e7e841e6460b3a4208d7a6a97ec Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 23 Nov 2023 17:55:33 +0100 Subject: [PATCH 03/33] add dockerfile --- .github/workflows/build-docker-images.yml | 493 +++++++++-------- .github/workflows/self-scheduled-amd.yml | 496 +++++++++--------- .../Dockerfile | 19 + 3 files changed, 531 insertions(+), 477 deletions(-) create mode 100644 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index b267ad7882d89f..6c567566d4d64d 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,6 +4,7 @@ on: push: branches: - build_ci_docker_image* + # - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -18,118 +19,280 @@ concurrency: cancel-in-progress: false jobs: - latest-docker: - name: "Latest PyTorch + TensorFlow [dev]" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu-push-ci + # latest-docker: + # name: "Latest PyTorch + TensorFlow [dev]" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu-push-ci + + # latest-torch-deepspeed-docker: + # name: "Latest PyTorch + DeepSpeed" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + + # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + # latest-torch-deepspeed-docker-for-push-ci-daily-build: + # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + + # doc-builder: + # name: "Doc builder" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-doc-builder + # push: true + # tags: huggingface/transformers-doc-builder + + # latest-pytorch: + # name: "Latest PyTorch [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-gpu + + # latest-pytorch-amd: + # name: "Latest PyTorch (AMD) [dev]" + # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + # steps: + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Check out code + # uses: actions/checkout@v3 + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - latest-torch-deepspeed-docker: - name: "Latest PyTorch + DeepSpeed" - runs-on: ubuntu-22.04 + # latest-tensorflow: + # name: "Latest TensorFlow [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-tensorflow-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-tensorflow-gpu + + latest-pytorch-deepspeed-amd: + name: "PyTorch + DeepSpeed (AMD)" + + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - - name: Check out code + - name: Check out code uses: actions/checkout@v3 - - - name: Login to DockerHub + - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push + - name: Build and push uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu + context: ./docker/transformers-pytorch-deepspeed-amd-gpu build-args: | REF=main push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - - # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - latest-torch-deepspeed-docker-for-push-ci-daily-build: - name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test # Push CI images still need to be re-built daily - name: Build and push (for Push CI) in a daily basis @@ -138,136 +301,8 @@ jobs: if: inputs.image_postfix != '-push-ci' uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - - doc-builder: - name: "Doc builder" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-doc-builder - push: true - tags: huggingface/transformers-doc-builder - - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-gpu - -# Need to be fixed with the help from Guillaume. -# latest-pytorch-amd: -# name: "Latest PyTorch (AMD) [dev]" -# runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] -# steps: -# - name: Set up Docker Buildx -# uses: docker/setup-buildx-action@v3 -# - name: Check out code -# uses: actions/checkout@v3 -# - name: Login to DockerHub -# uses: docker/login-action@v3 -# with: -# username: ${{ secrets.DOCKERHUB_USERNAME }} -# password: ${{ secrets.DOCKERHUB_PASSWORD }} -# - name: Build and push -# uses: docker/build-push-action@v5 -# with: -# context: ./docker/transformers-pytorch-amd-gpu -# build-args: | -# REF=main -# push: true -# tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} -# # Push CI images still need to be re-built daily -# - -# name: Build and push (for Push CI) in a daily basis -# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. -# # The later case is useful for manual image building for debugging purpose. Use another tag in this case! -# if: inputs.image_postfix != '-push-ci' -# uses: docker/build-push-action@v5 -# with: -# context: ./docker/transformers-pytorch-amd-gpu -# build-args: | -# REF=main -# push: true -# tags: huggingface/transformers-pytorch-amd-gpu-push-ci - - latest-tensorflow: - name: "Latest TensorFlow [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-tensorflow-gpu + context: ./docker/transformers-pytorch-deepspeed-amd-gpu build-args: | REF=main push: true - tags: huggingface/transformers-tensorflow-gpu + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci-test \ No newline at end of file diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 0bf9c092c87fd5..2cc44f553f4944 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,254 +107,254 @@ jobs: run: | python3 utils/print_env.py - run_tests_single_gpu: - name: Single GPU tests - strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_tests_multi_gpu: - name: Multi GPU tests - strategy: - max-parallel: 1 - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_examples_gpu: - name: Examples tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - run_pipelines_torch_gpu: - name: PyTorch pipelines tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + # run_tests_single_gpu: + # name: Single GPU tests + # strategy: + # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_tests_multi_gpu: + # name: Multi GPU tests + # strategy: + # max-parallel: 1 + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_examples_gpu: + # name: Examples tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run examples tests on GPU + # working-directory: /transformers + # run: | + # pip install -r examples/pytorch/_tests_requirements.txt + # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_examples_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + # run_pipelines_torch_gpu: + # name: PyTorch pipelines tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu, multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all pipeline tests on GPU + # working-directory: /transformers + # run: | + # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_all_tests_torch_rocm_deepspeed_gpu: name: Torch ROCm deepspeed tests diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile new file mode 100644 index 00000000000000..a5752ebbbeef72 --- /dev/null +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -0,0 +1,19 @@ +FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +# ARG PYTORCH='2.0.1' +# ARG ROCM='5.7' + +RUN python3 -m pip install --no-cache-dir --upgrade pip + +ARG REF=main +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF +RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop + +RUN python3 -c "from deepspeed.launcher.runner import main" \ No newline at end of file From 5a9a5296adc6d11aa4500be96f256b21cd0832a1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 23 Nov 2023 19:18:13 +0100 Subject: [PATCH 04/33] add comment --- .github/workflows/self-scheduled-amd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 2cc44f553f4944..7fe82bda8c9926 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -366,7 +366,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: - image: huggingface/transformers-pytorch-amd-gpu-test + image: huggingface/transformers-pytorch-amd-gpu-test # replace with huggingface/transformers-pytorch-deepspeed-amd-gpu/ options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone From af46e872c66fdb3ac80ce3ca14017d88483e201f Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 23 Nov 2023 19:25:14 +0100 Subject: [PATCH 05/33] enable tests --- .github/workflows/self-scheduled-amd.yml | 496 +++++++++++------------ 1 file changed, 248 insertions(+), 248 deletions(-) diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 7fe82bda8c9926..410afbd635fa05 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,254 +107,254 @@ jobs: run: | python3 utils/print_env.py - # run_tests_single_gpu: - # name: Single GPU tests - # strategy: - # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - # fail-fast: false - # matrix: - # folders: ${{ fromJson(needs.setup.outputs.matrix) }} - # machine_type: [single-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Echo folder ${{ matrix.folders }} - # shell: bash - # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # # set the artifact folder names (because the character `/` is not allowed). - # run: | - # echo "${{ matrix.folders }}" - # matrix_folders=${{ matrix.folders }} - # matrix_folders=${matrix_folders/'models/'/'models_'} - # echo "$matrix_folders" - # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all tests on GPU - # working-directory: /transformers - # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - # run_tests_multi_gpu: - # name: Multi GPU tests - # strategy: - # max-parallel: 1 - # fail-fast: false - # matrix: - # folders: ${{ fromJson(needs.setup.outputs.matrix) }} - # machine_type: [multi-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Echo folder ${{ matrix.folders }} - # shell: bash - # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # # set the artifact folder names (because the character `/` is not allowed). - # run: | - # echo "${{ matrix.folders }}" - # matrix_folders=${{ matrix.folders }} - # matrix_folders=${matrix_folders/'models/'/'models_'} - # echo "$matrix_folders" - # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all tests on GPU - # working-directory: /transformers - # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - # run_examples_gpu: - # name: Examples tests - # strategy: - # fail-fast: false - # matrix: - # machine_type: [single-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run examples tests on GPU - # working-directory: /transformers - # run: | - # pip install -r examples/pytorch/_tests_requirements.txt - # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_examples_gpu - # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - # run_pipelines_torch_gpu: - # name: PyTorch pipelines tests - # strategy: - # fail-fast: false - # matrix: - # machine_type: [single-gpu, multi-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all pipeline tests on GPU - # working-directory: /transformers - # run: | - # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + run_tests_single_gpu: + name: Single GPU tests + strategy: + max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_tests_multi_gpu: + name: Multi GPU tests + strategy: + max-parallel: 1 + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [multi-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_examples_gpu: + name: Examples tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run examples tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_examples_gpu + path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + run_pipelines_torch_gpu: + name: PyTorch pipelines tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_all_tests_torch_rocm_deepspeed_gpu: name: Torch ROCm deepspeed tests From c29d2492a5dafd60d5359a6f5b3138cfc3e41c6d Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 27 Nov 2023 18:54:31 +0100 Subject: [PATCH 06/33] trigger --- .github/workflows/build-docker-images.yml | 271 +----------------- .../Dockerfile | 9 +- 2 files changed, 10 insertions(+), 270 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 6c567566d4d64d..201ea127ba07f2 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,7 @@ on: push: branches: - build_ci_docker_image* - # - run_amd_scheduled_ci_caller_deepspeed_test + - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -19,262 +19,10 @@ concurrency: cancel-in-progress: false jobs: - # latest-docker: - # name: "Latest PyTorch + TensorFlow [dev]" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-all-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-all-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-all-latest-gpu-push-ci - - # latest-torch-deepspeed-docker: - # name: "Latest PyTorch + DeepSpeed" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - - # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - # latest-torch-deepspeed-docker-for-push-ci-daily-build: - # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - - # doc-builder: - # name: "Doc builder" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-doc-builder - # push: true - # tags: huggingface/transformers-doc-builder - - # latest-pytorch: - # name: "Latest PyTorch [dev]" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-gpu - - # latest-pytorch-amd: - # name: "Latest PyTorch (AMD) [dev]" - # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] - # steps: - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - name: Check out code - # uses: actions/checkout@v3 - # - name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-amd-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} - # # Push CI images still need to be re-built daily - # - name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-amd-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - - # latest-tensorflow: - # name: "Latest TensorFlow [dev]" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-tensorflow-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-tensorflow-gpu - latest-pytorch-deepspeed-amd: - name: "PyTorch + DeepSpeed (AMD)" + name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + runs-on: [self-hosted, yih-dar-shieh-debug-daily] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -293,16 +41,3 @@ jobs: REF=main push: true tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci-test \ No newline at end of file diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index a5752ebbbeef72..574951727c1835 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -2,13 +2,18 @@ FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive +ARG PYTORCH='2.0.1' +ARG ROCM='5.7' -# ARG PYTORCH='2.0.1' -# ARG ROCM='5.7' +RUN apt update && \ + apt install -y --no-install-recommends libaio-dev && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main +WORKDIR / RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] From a0c3dafbeee98952996e27ea53734ab736a319e2 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 27 Nov 2023 19:23:02 +0100 Subject: [PATCH 07/33] remove trigger for this branch --- .github/workflows/build-docker-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 201ea127ba07f2..c5d5d0402d56a5 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,7 @@ on: push: branches: - build_ci_docker_image* - - run_amd_scheduled_ci_caller_deepspeed_test + # - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: From 4cb9d6f54fdc225dd213e951026dd9ce1eb159fc Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 28 Nov 2023 12:04:16 +0100 Subject: [PATCH 08/33] trigger --- .github/workflows/build-docker-images.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 528 +++++++++--------- .../Dockerfile | 4 +- 3 files changed, 267 insertions(+), 267 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index c5d5d0402d56a5..201ea127ba07f2 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,7 @@ on: push: branches: - build_ci_docker_image* - # - run_amd_scheduled_ci_caller_deepspeed_test + - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 410afbd635fa05..639a4df69c1de1 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,256 +107,256 @@ jobs: run: | python3 utils/print_env.py - run_tests_single_gpu: - name: Single GPU tests - strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_tests_multi_gpu: - name: Multi GPU tests - strategy: - max-parallel: 1 - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_examples_gpu: - name: Examples tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - run_pipelines_torch_gpu: - name: PyTorch pipelines tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu - - run_all_tests_torch_rocm_deepspeed_gpu: + # run_tests_single_gpu: + # name: Single GPU tests + # strategy: + # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_tests_multi_gpu: + # name: Multi GPU tests + # strategy: + # max-parallel: 1 + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_examples_gpu: + # name: Examples tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run examples tests on GPU + # working-directory: /transformers + # run: | + # pip install -r examples/pytorch/_tests_requirements.txt + # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_examples_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + # run_pipelines_torch_gpu: + # name: PyTorch pipelines tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu, multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all pipeline tests on GPU + # working-directory: /transformers + # run: | + # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + + run_tests_torch_deepspeed_gpu: name: Torch ROCm deepspeed tests strategy: fail-fast: false @@ -398,19 +398,19 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu tests/deepspeed tests/extended + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_torch_rocm_deepspeed_gpu_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_rocm_deepspeed_gpu + name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu run_extract_warnings: name: Extract warnings in CI artifacts @@ -420,11 +420,11 @@ jobs: check_runner_status, check_runners, setup, - run_tests_single_gpu, - run_tests_multi_gpu, - run_examples_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_rocm_deepspeed_gpu + # run_tests_single_gpu, + # run_tests_multi_gpu, + # run_examples_gpu, + # run_pipelines_torch_gpu, + run_tests_torch_deepspeed_gpu ] steps: - name: Checkout transformers @@ -469,11 +469,11 @@ jobs: check_runner_status, check_runners, setup, - run_tests_single_gpu, - run_tests_multi_gpu, - run_examples_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_rocm_deepspeed_gpu, + # run_tests_single_gpu, + # run_tests_multi_gpu, + # run_examples_gpu, + # run_pipelines_torch_gpu, + run_tests_torch_deepspeed_gpu, run_extract_warnings ] steps: diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index 574951727c1835..cfbb5938fb37ae 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed +FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -6,7 +6,7 @@ ARG PYTORCH='2.0.1' ARG ROCM='5.7' RUN apt update && \ - apt install -y --no-install-recommends libaio-dev && \ + apt install -y --no-install-recommends libaio-dev git && \ apt clean && \ rm -rf /var/lib/apt/lists/* From a7033499d83d3aa0a81eb695dcaf1d0f49d7eaab Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 28 Nov 2023 13:53:57 +0100 Subject: [PATCH 09/33] change runner env to trigger the docker build image test --- .github/workflows/build-docker-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 201ea127ba07f2..f15e68218c6197 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -22,7 +22,7 @@ jobs: latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, yih-dar-shieh-debug-daily] + runs-on: [self-hosted, yih-dar-shieh-debug-doctest] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 From a47ac2ca1f98f4dc2b1eafeb31855c6b40e1fa56 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 28 Nov 2023 17:13:25 +0100 Subject: [PATCH 10/33] use new docker image --- .github/workflows/build-docker-images.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index f15e68218c6197..5bf28698f4ec1f 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,7 @@ on: push: branches: - build_ci_docker_image* - - run_amd_scheduled_ci_caller_deepspeed_test + # - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 639a4df69c1de1..348652560f3d9b 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -366,7 +366,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: - image: huggingface/transformers-pytorch-amd-gpu-test # replace with huggingface/transformers-pytorch-deepspeed-amd-gpu/ + image: huggingface/transformers-pytorch-deepspeed-amd-gpu-test # remove -test options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone @@ -375,7 +375,7 @@ jobs: - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[deepspeed] + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - name: ROCM-SMI run: | From 233bd7f07a6e0fcf92a4558dc1dbc249170ec037 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 28 Nov 2023 23:45:52 +0100 Subject: [PATCH 11/33] remove test suffix from docker image tag --- .github/workflows/build-docker-images.yml | 4 ++-- .github/workflows/self-scheduled-amd-caller.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 5bf28698f4ec1f..84fdc8ab68f232 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,7 @@ on: push: branches: - build_ci_docker_image* - # - run_amd_scheduled_ci_caller_deepspeed_test + - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -40,4 +40,4 @@ jobs: build-args: | REF=main push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index 4755bd868249ca..883477e458fd65 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -5,7 +5,7 @@ on: - cron: "17 2 * * *" push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci_mi210: From 971ba80a8cfc06110446fcad7ab5d5527c45602a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 29 Nov 2023 00:00:14 +0100 Subject: [PATCH 12/33] replace test docker image with original image --- .github/workflows/build-docker-images.yml | 270 +++++++++++++++++- .../workflows/self-scheduled-amd-caller.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 2 +- 3 files changed, 270 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 84fdc8ab68f232..eb7b172888580b 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,6 @@ on: push: branches: - build_ci_docker_image* - - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -19,10 +18,264 @@ concurrency: cancel-in-progress: false jobs: + latest-docker: + name: "Latest PyTorch + TensorFlow [dev]" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu-push-ci + + latest-torch-deepspeed-docker: + name: "Latest PyTorch + DeepSpeed" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + latest-torch-deepspeed-docker-for-push-ci-daily-build: + name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + + doc-builder: + name: "Doc builder" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-doc-builder + push: true + tags: huggingface/transformers-doc-builder + + latest-pytorch: + name: "Latest PyTorch [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-gpu + +# Need to be fixed with the help from Guillaume. +# latest-pytorch-amd: +# name: "Latest PyTorch (AMD) [dev]" +# runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] +# steps: +# - name: Set up Docker Buildx +# uses: docker/setup-buildx-action@v3 +# - name: Check out code +# uses: actions/checkout@v3 +# - name: Login to DockerHub +# uses: docker/login-action@v3 +# with: +# username: ${{ secrets.DOCKERHUB_USERNAME }} +# password: ${{ secrets.DOCKERHUB_PASSWORD }} +# - name: Build and push +# uses: docker/build-push-action@v5 +# with: +# context: ./docker/transformers-pytorch-amd-gpu +# build-args: | +# REF=main +# push: true +# tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} +# # Push CI images still need to be re-built daily +# - +# name: Build and push (for Push CI) in a daily basis +# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. +# # The later case is useful for manual image building for debugging purpose. Use another tag in this case! +# if: inputs.image_postfix != '-push-ci' +# uses: docker/build-push-action@v5 +# with: +# context: ./docker/transformers-pytorch-amd-gpu +# build-args: | +# REF=main +# push: true +# tags: huggingface/transformers-pytorch-amd-gpu-push-ci + + latest-tensorflow: + name: "Latest TensorFlow [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-tensorflow-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-tensorflow-gpu + latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, yih-dar-shieh-debug-doctest] + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -41,3 +294,16 @@ jobs: REF=main push: true tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index 883477e458fd65..4755bd868249ca 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -5,7 +5,7 @@ on: - cron: "17 2 * * *" push: branches: - - run_amd_scheduled_ci_caller__* + - run_amd_scheduled_ci_caller* jobs: run_amd_ci_mi210: diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 348652560f3d9b..1cc6af0eed8ff2 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -366,7 +366,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: - image: huggingface/transformers-pytorch-deepspeed-amd-gpu-test # remove -test + image: huggingface/transformers-pytorch-deepspeed-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone From da4774c0481147ebdd6f59109293010fad8cd2bd Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 29 Nov 2023 19:07:15 +0100 Subject: [PATCH 13/33] push new image --- .github/workflows/build-docker-images.yml | 233 +----------------- .../workflows/self-scheduled-amd-caller.yml | 2 +- .../Dockerfile | 2 + 3 files changed, 6 insertions(+), 231 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index eb7b172888580b..3dc558cc3e473a 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,6 +4,7 @@ on: push: branches: - build_ci_docker_image* + - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -18,195 +19,6 @@ concurrency: cancel-in-progress: false jobs: - latest-docker: - name: "Latest PyTorch + TensorFlow [dev]" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu-push-ci - - latest-torch-deepspeed-docker: - name: "Latest PyTorch + DeepSpeed" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - - # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - latest-torch-deepspeed-docker-for-push-ci-daily-build: - name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - - doc-builder: - name: "Doc builder" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-doc-builder - push: true - tags: huggingface/transformers-doc-builder - - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-gpu # Need to be fixed with the help from Guillaume. # latest-pytorch-amd: @@ -244,38 +56,12 @@ jobs: # push: true # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - latest-tensorflow: - name: "Latest TensorFlow [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-tensorflow-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-tensorflow-gpu + latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + runs-on: [self-hosted, yih-dar-shieh-debug-doctest] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -294,16 +80,3 @@ jobs: REF=main push: true tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index 4755bd868249ca..883477e458fd65 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -5,7 +5,7 @@ on: - cron: "17 2 * * *" push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci_mi210: diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index cfbb5938fb37ae..5090b14f9c283d 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -12,6 +12,8 @@ RUN apt update && \ RUN python3 -m pip install --no-cache-dir --upgrade pip +RUN python3 -m pip uninstall -y apex + ARG REF=main WORKDIR / RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF From cbe995ff2041aa5f481d2543a388864192ea9dba Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 11:19:46 +0100 Subject: [PATCH 14/33] Trigger From e16c271403b94455d418e106cc22e00df778349c Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 11:21:38 +0100 Subject: [PATCH 15/33] add back amd tests --- .github/workflows/build-docker-images.yml | 233 +++++++++++++++++- .../workflows/self-scheduled-amd-caller.yml | 2 +- 2 files changed, 231 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 3dc558cc3e473a..eb7b172888580b 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,7 +4,6 @@ on: push: branches: - build_ci_docker_image* - - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -19,6 +18,195 @@ concurrency: cancel-in-progress: false jobs: + latest-docker: + name: "Latest PyTorch + TensorFlow [dev]" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu-push-ci + + latest-torch-deepspeed-docker: + name: "Latest PyTorch + DeepSpeed" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + latest-torch-deepspeed-docker-for-push-ci-daily-build: + name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + + doc-builder: + name: "Doc builder" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-doc-builder + push: true + tags: huggingface/transformers-doc-builder + + latest-pytorch: + name: "Latest PyTorch [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-gpu # Need to be fixed with the help from Guillaume. # latest-pytorch-amd: @@ -56,12 +244,38 @@ jobs: # push: true # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - + latest-tensorflow: + name: "Latest TensorFlow [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-tensorflow-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-tensorflow-gpu latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, yih-dar-shieh-debug-doctest] + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -80,3 +294,16 @@ jobs: REF=main push: true tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index 883477e458fd65..4755bd868249ca 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -5,7 +5,7 @@ on: - cron: "17 2 * * *" push: branches: - - run_amd_scheduled_ci_caller__* + - run_amd_scheduled_ci_caller* jobs: run_amd_ci_mi210: From 70c3580febcf837848ada43832b6758b2744ad1f Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 11:25:49 +0100 Subject: [PATCH 16/33] fix typo --- docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 276f35f3351846..184639552eee7a 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt # recompile apex RUN python3 -m pip uninstall -y apex -RUN git clone https://github.com/NVIDIA/apex +# RUN git clone https://github.com/NVIDIA/apex # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners # TODO: check if there is alternative way to install latest apex # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . From 090b88e20466623a9aa6709487c5817dd6f7d774 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 11:26:04 +0100 Subject: [PATCH 17/33] add amd tests back --- .github/workflows/self-scheduled-amd.yml | 512 +++++++++++------------ 1 file changed, 256 insertions(+), 256 deletions(-) diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 1cc6af0eed8ff2..54ab319897ef2e 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,254 +107,254 @@ jobs: run: | python3 utils/print_env.py - # run_tests_single_gpu: - # name: Single GPU tests - # strategy: - # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - # fail-fast: false - # matrix: - # folders: ${{ fromJson(needs.setup.outputs.matrix) }} - # machine_type: [single-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Echo folder ${{ matrix.folders }} - # shell: bash - # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # # set the artifact folder names (because the character `/` is not allowed). - # run: | - # echo "${{ matrix.folders }}" - # matrix_folders=${{ matrix.folders }} - # matrix_folders=${matrix_folders/'models/'/'models_'} - # echo "$matrix_folders" - # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all tests on GPU - # working-directory: /transformers - # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - # run_tests_multi_gpu: - # name: Multi GPU tests - # strategy: - # max-parallel: 1 - # fail-fast: false - # matrix: - # folders: ${{ fromJson(needs.setup.outputs.matrix) }} - # machine_type: [multi-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Echo folder ${{ matrix.folders }} - # shell: bash - # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # # set the artifact folder names (because the character `/` is not allowed). - # run: | - # echo "${{ matrix.folders }}" - # matrix_folders=${{ matrix.folders }} - # matrix_folders=${matrix_folders/'models/'/'models_'} - # echo "$matrix_folders" - # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all tests on GPU - # working-directory: /transformers - # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - # run_examples_gpu: - # name: Examples tests - # strategy: - # fail-fast: false - # matrix: - # machine_type: [single-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run examples tests on GPU - # working-directory: /transformers - # run: | - # pip install -r examples/pytorch/_tests_requirements.txt - # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_examples_gpu - # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - # run_pipelines_torch_gpu: - # name: PyTorch pipelines tests - # strategy: - # fail-fast: false - # matrix: - # machine_type: [single-gpu, multi-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all pipeline tests on GPU - # working-directory: /transformers - # run: | - # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + run_tests_single_gpu: + name: Single GPU tests + strategy: + max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_tests_multi_gpu: + name: Multi GPU tests + strategy: + max-parallel: 1 + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [multi-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_examples_gpu: + name: Examples tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run examples tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_examples_gpu + path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + run_pipelines_torch_gpu: + name: PyTorch pipelines tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_tests_torch_deepspeed_gpu: name: Torch ROCm deepspeed tests @@ -420,10 +420,10 @@ jobs: check_runner_status, check_runners, setup, - # run_tests_single_gpu, - # run_tests_multi_gpu, - # run_examples_gpu, - # run_pipelines_torch_gpu, + run_tests_single_gpu, + run_tests_multi_gpu, + run_examples_gpu, + run_pipelines_torch_gpu, run_tests_torch_deepspeed_gpu ] steps: @@ -469,10 +469,10 @@ jobs: check_runner_status, check_runners, setup, - # run_tests_single_gpu, - # run_tests_multi_gpu, - # run_examples_gpu, - # run_pipelines_torch_gpu, + run_tests_single_gpu, + run_tests_multi_gpu, + run_examples_gpu, + run_pipelines_torch_gpu, run_tests_torch_deepspeed_gpu, run_extract_warnings ] From 508ae294fdc9012ae7f87f9895c1a80c3ce876c5 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 15:37:35 +0100 Subject: [PATCH 18/33] fix --- .github/workflows/self-scheduled-amd-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index 4755bd868249ca..883477e458fd65 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -5,7 +5,7 @@ on: - cron: "17 2 * * *" push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci_mi210: From 09fee9eac76ba46427662d430b23c7c2cb344c4e Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 15:57:12 +0100 Subject: [PATCH 19/33] comment until docker image build scheduled test fix --- .github/workflows/build-docker-images.yml | 68 +++++++++++------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index eb7b172888580b..be070a95d3a94f 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -272,38 +272,38 @@ jobs: push: true tags: huggingface/transformers-tensorflow-gpu - latest-pytorch-deepspeed-amd: - name: "PyTorch + DeepSpeed (AMD) [dev]" + # latest-pytorch-deepspeed-amd: + # name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] - steps: - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Check out code - uses: actions/checkout@v3 - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + # steps: + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Check out code + # uses: actions/checkout@v3 + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci From 407cfe972256448fae967fa4504483319a9a7696 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 16:21:43 +0100 Subject: [PATCH 20/33] remove deprecated deepspeed build option --- .github/workflows/self-nightly-scheduled.yml | 4 ++-- .github/workflows/self-past.yml | 4 ++-- .github/workflows/self-push.yml | 4 ++-- .github/workflows/self-scheduled.yml | 2 +- docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index e4b4f7f77cf077..37dc98f340a16d 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -212,7 +212,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -286,4 +286,4 @@ jobs: with: name: | single-* - multi-* \ No newline at end of file + multi-* diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index 6a154544df8b97..d0ce313bd48e0a 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -255,7 +255,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -341,4 +341,4 @@ jobs: with: name: | single-* - multi-* \ No newline at end of file + multi-* diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index a6ea5b1e04b942..e6f1f3b3050f7a 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -366,7 +366,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | @@ -456,7 +456,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 4a04cb14ac7bb3..bf7f579a9e3ad2 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -366,7 +366,7 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 184639552eee7a..a8789f9170e721 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. -RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. From f846b80bed3b75471edfde5ae3b617d0b235765a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 30 Nov 2023 18:20:53 +0100 Subject: [PATCH 21/33] upgrade torch --- .../Dockerfile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index 5090b14f9c283d..817abd42df7baa 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -1,9 +1,11 @@ -FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 +FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='2.0.1' -ARG ROCM='5.7' +ARG PYTORCH='2.1.0' +ARG TORCH_VISION='0.16.0' +ARG TORCH_AUDIO='2.1.0' +ARG ROCM='5.6' RUN apt update && \ apt install -y --no-install-recommends libaio-dev git && \ @@ -14,6 +16,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip RUN python3 -m pip uninstall -y apex +RUN python3 -m pip uninstall -y torch torchvision torchaudio + +RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM + ARG REF=main WORKDIR / RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF From 785b63aed48b6b15b538aa49c71327cd7fb98d50 Mon Sep 17 00:00:00 2001 From: Felix Marty Date: Mon, 4 Dec 2023 15:30:47 +0000 Subject: [PATCH 22/33] update docker & make tests pass --- .../transformers-pytorch-amd-gpu/Dockerfile | 4 +++ .../Dockerfile | 32 ++++++++++++------- tests/deepspeed/test_deepspeed.py | 4 +-- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index 216ff4c4385548..46ca1a531b4ab4 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -22,7 +22,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+htt ARG REF=main WORKDIR / + +# Invalidate docker cache from here if new commit is available. +ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video] RUN python3 -m pip uninstall -y tensorflow flax diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index 817abd42df7baa..e426b3ee75809d 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -1,32 +1,40 @@ -FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1 +FROM rocm/dev-ubuntu-22.04:5.6 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='2.1.0' -ARG TORCH_VISION='0.16.0' -ARG TORCH_AUDIO='2.1.0' +ARG PYTORCH='2.1.1' +ARG TORCH_VISION='0.16.1' +ARG TORCH_AUDIO='2.1.1' ARG ROCM='5.6' RUN apt update && \ - apt install -y --no-install-recommends libaio-dev git && \ + apt install -y --no-install-recommends \ + libaio-dev \ + git \ + # These are required to build deepspeed. + python3-dev \ + rocrand-dev \ + rocthrust-dev \ + hipsparse-dev \ + hipblas-dev && \ apt clean && \ rm -rf /var/lib/apt/lists/* RUN python3 -m pip install --no-cache-dir --upgrade pip - -RUN python3 -m pip uninstall -y apex - -RUN python3 -m pip uninstall -y torch torchvision torchaudio - -RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM +RUN python3 -m pip uninstall -y apex torch torchvision torchaudio +RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir ARG REF=main WORKDIR / + +# Invalidate docker cache from here if new commit is available. +ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop -RUN python3 -c "from deepspeed.launcher.runner import main" \ No newline at end of file +RUN python3 -c "from deepspeed.launcher.runner import main" diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 2352cf522f29a7..14c8f6703166c9 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -561,8 +561,8 @@ def test_gradient_accumulation(self, stage, dtype): self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5) self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5) - # see the note above how to get identical loss on a small bs - self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2) + # Relative difference. See the note above how to get identical loss on a small bs + self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3) def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype): # adapted from TrainerIntegrationCommon.check_saved_checkpoints From f0f931e2fcf5bdfc1c510a53d9787f339785e683 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Tue, 5 Dec 2023 19:34:58 +0900 Subject: [PATCH 23/33] Update docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile --- docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index e426b3ee75809d..36a2d8fafd8eb4 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -16,7 +16,8 @@ RUN apt update && \ rocrand-dev \ rocthrust-dev \ hipsparse-dev \ - hipblas-dev && \ + hipblas-dev \ + rocblas-dev && \ apt clean && \ rm -rf /var/lib/apt/lists/* From 40398b9a0f4bb08ea2829e1fac8bb44cb5b9812b Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Dec 2023 11:47:45 +0100 Subject: [PATCH 24/33] fix --- .github/workflows/self-scheduled-amd-caller.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index 14d46f453c31d6..fb02d2742d16dc 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -3,9 +3,6 @@ name: Self-hosted runner (AMD scheduled CI caller) on: schedule: - cron: "17 2 * * *" - push: - branches: - - run_amd_scheduled_ci_caller* jobs: run_scheduled_amd_ci: @@ -14,4 +11,4 @@ jobs: if: ${{ always() }} steps: - name: Trigger scheduled AMD CI via workflow_run - run: echo "Trigger scheduled AMD CI via workflow_run" + run: echo "Trigger scheduled AMD CI via workflow_run" \ No newline at end of file From 3332cd2eb55005b597708805699285948d0a2f3e Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Dec 2023 11:47:57 +0100 Subject: [PATCH 25/33] tmp disable test --- .github/workflows/self-scheduled-amd-mi210-caller.yml | 2 +- .github/workflows/self-scheduled-amd-mi250-caller.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index ceaba454ae642e..68294abfa2a730 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index 843e3476342e9d..748c6ec0d18711 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci: From 9696cc4ef7b8859b830b9e4cb1be1cf881945bcd Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Dec 2023 14:28:28 +0100 Subject: [PATCH 26/33] precompile deepspeed to avoid timeout during tests --- docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index 36a2d8fafd8eb4..466af2fcbf889f 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -21,10 +21,13 @@ RUN apt update && \ apt clean && \ rm -rf /var/lib/apt/lists/* -RUN python3 -m pip install --no-cache-dir --upgrade pip +RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2" RUN python3 -m pip uninstall -y apex torch torchvision torchaudio RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir +# Pre-build **latest** DeepSpeed, so it's be ready for testing (to avoid timeout) +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1 + ARG REF=main WORKDIR / @@ -32,10 +35,10 @@ WORKDIR / ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece] # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop -RUN python3 -c "from deepspeed.launcher.runner import main" +RUN python3 -c "from deepspeed.launcher.runner import main" \ No newline at end of file From 84a7a3398de8c83c66f7d5ca2600d65407dd5715 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Dec 2023 16:13:11 +0100 Subject: [PATCH 27/33] fix comment --- docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index 466af2fcbf889f..e309d555306bdb 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -25,7 +25,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2" RUN python3 -m pip uninstall -y apex torch torchvision torchaudio RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir -# Pre-build **latest** DeepSpeed, so it's be ready for testing (to avoid timeout) +# Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout) RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1 ARG REF=main From df00cff6caa4722a26f1de9c3219ce8e28c7c5c0 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Dec 2023 17:56:47 +0100 Subject: [PATCH 28/33] trigger deepspeed tests with new image --- .../self-scheduled-amd-mi210-caller.yml | 2 +- .../self-scheduled-amd-mi250-caller.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 498 +++++++++--------- 3 files changed, 251 insertions(+), 251 deletions(-) diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index 68294abfa2a730..ceaba454ae642e 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller__* + - run_amd_scheduled_ci_caller* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index 748c6ec0d18711..843e3476342e9d 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller__* + - run_amd_scheduled_ci_caller* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 997b5ed4ee16eb..70f844da792039 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,254 +107,254 @@ jobs: run: | python3 utils/print_env.py - run_tests_single_gpu: - name: Single GPU tests - strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_tests_multi_gpu: - name: Multi GPU tests - strategy: - max-parallel: 1 - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_examples_gpu: - name: Examples tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - run_pipelines_torch_gpu: - name: PyTorch pipelines tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + # run_tests_single_gpu: + # name: Single GPU tests + # strategy: + # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_tests_multi_gpu: + # name: Multi GPU tests + # strategy: + # max-parallel: 1 + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_examples_gpu: + # name: Examples tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run examples tests on GPU + # working-directory: /transformers + # run: | + # pip install -r examples/pytorch/_tests_requirements.txt + # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_examples_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + # run_pipelines_torch_gpu: + # name: PyTorch pipelines tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu, multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all pipeline tests on GPU + # working-directory: /transformers + # run: | + # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_tests_torch_deepspeed_gpu: name: Torch ROCm deepspeed tests @@ -366,7 +366,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: - image: huggingface/transformers-pytorch-deepspeed-amd-gpu + image: echarlaix/amd-deepspeed-test # TODO: replace with huggingface/transformers-pytorch-deepspeed-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone From 92c402d9e48ae6f4b68caab8115259b42ad70cd5 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Dec 2023 18:04:02 +0100 Subject: [PATCH 29/33] comment tests --- .github/workflows/self-scheduled-amd-caller.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml index fb02d2742d16dc..dc5c7b7e905bd8 100644 --- a/.github/workflows/self-scheduled-amd-caller.yml +++ b/.github/workflows/self-scheduled-amd-caller.yml @@ -11,4 +11,4 @@ jobs: if: ${{ always() }} steps: - name: Trigger scheduled AMD CI via workflow_run - run: echo "Trigger scheduled AMD CI via workflow_run" \ No newline at end of file + run: echo "Trigger scheduled AMD CI via workflow_run" diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 70f844da792039..1ac5ef980c95e2 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -420,10 +420,10 @@ jobs: check_runner_status, check_runners, setup, - run_tests_single_gpu, - run_tests_multi_gpu, - run_examples_gpu, - run_pipelines_torch_gpu, + # run_tests_single_gpu, + # run_tests_multi_gpu, + # run_examples_gpu, + # run_pipelines_torch_gpu, run_tests_torch_deepspeed_gpu ] steps: @@ -469,10 +469,10 @@ jobs: check_runner_status, check_runners, setup, - run_tests_single_gpu, - run_tests_multi_gpu, - run_examples_gpu, - run_pipelines_torch_gpu, + # run_tests_single_gpu, + # run_tests_multi_gpu, + # run_examples_gpu, + # run_pipelines_torch_gpu, run_tests_torch_deepspeed_gpu, run_extract_warnings ] From fa82a9c747bbc2acc1b2f00fc6092159d8fdfbb2 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 6 Dec 2023 10:42:57 +0100 Subject: [PATCH 30/33] trigger From ecb92392c6b7f10acb27d7691972c7191fc3962b Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 6 Dec 2023 12:53:01 +0100 Subject: [PATCH 31/33] add sklearn dependency to fix slow tests --- docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index e309d555306bdb..1fa384dfa2bc03 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -13,6 +13,7 @@ RUN apt update && \ git \ # These are required to build deepspeed. python3-dev \ + python-is-python3 \ rocrand-dev \ rocthrust-dev \ hipsparse-dev \ @@ -35,7 +36,7 @@ WORKDIR / ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece] +RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn] # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. From cfcc312b4e83c7ab665032d40cfca7ed9c02e443 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 6 Dec 2023 15:59:49 +0100 Subject: [PATCH 32/33] enable back other tests --- .../self-scheduled-amd-mi210-caller.yml | 2 +- .../self-scheduled-amd-mi250-caller.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 512 +++++++++--------- 3 files changed, 258 insertions(+), 258 deletions(-) diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index ceaba454ae642e..68294abfa2a730 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index 843e3476342e9d..748c6ec0d18711 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller* + - run_amd_scheduled_ci_caller__* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 1ac5ef980c95e2..43d59b90e3b152 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,254 +107,254 @@ jobs: run: | python3 utils/print_env.py - # run_tests_single_gpu: - # name: Single GPU tests - # strategy: - # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - # fail-fast: false - # matrix: - # folders: ${{ fromJson(needs.setup.outputs.matrix) }} - # machine_type: [single-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Echo folder ${{ matrix.folders }} - # shell: bash - # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # # set the artifact folder names (because the character `/` is not allowed). - # run: | - # echo "${{ matrix.folders }}" - # matrix_folders=${{ matrix.folders }} - # matrix_folders=${matrix_folders/'models/'/'models_'} - # echo "$matrix_folders" - # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all tests on GPU - # working-directory: /transformers - # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - # run_tests_multi_gpu: - # name: Multi GPU tests - # strategy: - # max-parallel: 1 - # fail-fast: false - # matrix: - # folders: ${{ fromJson(needs.setup.outputs.matrix) }} - # machine_type: [multi-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Echo folder ${{ matrix.folders }} - # shell: bash - # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # # set the artifact folder names (because the character `/` is not allowed). - # run: | - # echo "${{ matrix.folders }}" - # matrix_folders=${{ matrix.folders }} - # matrix_folders=${matrix_folders/'models/'/'models_'} - # echo "$matrix_folders" - # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all tests on GPU - # working-directory: /transformers - # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - # run_examples_gpu: - # name: Examples tests - # strategy: - # fail-fast: false - # matrix: - # machine_type: [single-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run examples tests on GPU - # working-directory: /transformers - # run: | - # pip install -r examples/pytorch/_tests_requirements.txt - # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_examples_gpu - # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - # run_pipelines_torch_gpu: - # name: PyTorch pipelines tests - # strategy: - # fail-fast: false - # matrix: - # machine_type: [single-gpu, multi-gpu] - # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - # container: - # image: huggingface/transformers-pytorch-amd-gpu - # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - # needs: setup - # steps: - # - name: Update clone - # working-directory: /transformers - # run: git fetch && git checkout ${{ github.sha }} - - # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - # working-directory: /transformers - # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - # - name: ROCM-SMI - # run: | - # rocm-smi - # - name: ROCM-INFO - # run: | - # rocminfo | grep "Agent" -A 14 - # - name: Show ROCR environment - # run: | - # echo "ROCR: $ROCR_VISIBLE_DEVICES" - - # - name: Environment - # working-directory: /transformers - # run: | - # python3 utils/print_env.py - - # - name: Show installed libraries and their versions - # working-directory: /transformers - # run: pip freeze - - # - name: Run all pipeline tests on GPU - # working-directory: /transformers - # run: | - # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - # - name: Failure short reports - # if: ${{ failure() }} - # continue-on-error: true - # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - # - name: Test suite reports artifacts - # if: ${{ always() }} - # uses: actions/upload-artifact@v3 - # with: - # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + run_tests_single_gpu: + name: Single GPU tests + strategy: + max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_tests_multi_gpu: + name: Multi GPU tests + strategy: + max-parallel: 1 + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [multi-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_examples_gpu: + name: Examples tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run examples tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_examples_gpu + path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + run_pipelines_torch_gpu: + name: PyTorch pipelines tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_tests_torch_deepspeed_gpu: name: Torch ROCm deepspeed tests @@ -420,10 +420,10 @@ jobs: check_runner_status, check_runners, setup, - # run_tests_single_gpu, - # run_tests_multi_gpu, - # run_examples_gpu, - # run_pipelines_torch_gpu, + run_tests_single_gpu, + run_tests_multi_gpu, + run_examples_gpu, + run_pipelines_torch_gpu, run_tests_torch_deepspeed_gpu ] steps: @@ -469,10 +469,10 @@ jobs: check_runner_status, check_runners, setup, - # run_tests_single_gpu, - # run_tests_multi_gpu, - # run_examples_gpu, - # run_pipelines_torch_gpu, + run_tests_single_gpu, + run_tests_multi_gpu, + run_examples_gpu, + run_pipelines_torch_gpu, run_tests_torch_deepspeed_gpu, run_extract_warnings ] From ae82b3fd526fe10a9cab96df164e9fd8cc9d0b15 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Thu, 7 Dec 2023 15:43:15 +0100 Subject: [PATCH 33/33] final update --- .github/workflows/self-scheduled-amd-mi210-caller.yml | 2 +- .github/workflows/self-scheduled-amd-mi250-caller.yml | 2 +- .github/workflows/self-scheduled-amd.yml | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index 68294abfa2a730..ceaba454ae642e 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller__* + - run_amd_scheduled_ci_caller* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index 748c6ec0d18711..843e3476342e9d 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -7,7 +7,7 @@ on: types: [completed] push: branches: - - run_amd_scheduled_ci_caller__* + - run_amd_scheduled_ci_caller* jobs: run_amd_ci: diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 43d59b90e3b152..3d41a3b95e6c50 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -366,7 +366,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: - image: echarlaix/amd-deepspeed-test # TODO: replace with huggingface/transformers-pytorch-deepspeed-amd-gpu + image: huggingface/transformers-pytorch-deepspeed-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Update clone @@ -383,6 +383,7 @@ jobs: - name: ROCM-INFO run: | rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment run: | echo "ROCR: $ROCR_VISIBLE_DEVICES"