From 2cfb53d1d5486e7e841e6460b3a4208d7a6a97ec Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 23 Nov 2023 17:55:33 +0100 Subject: [PATCH] add dockerfile --- .github/workflows/build-docker-images.yml | 493 +++++++++-------- .github/workflows/self-scheduled-amd.yml | 496 +++++++++--------- .../Dockerfile | 19 + 3 files changed, 531 insertions(+), 477 deletions(-) create mode 100644 docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index b267ad7882d89f..6c567566d4d64d 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -4,6 +4,7 @@ on: push: branches: - build_ci_docker_image* + # - run_amd_scheduled_ci_caller_deepspeed_test repository_dispatch: workflow_call: inputs: @@ -18,118 +19,280 @@ concurrency: cancel-in-progress: false jobs: - latest-docker: - name: "Latest PyTorch + TensorFlow [dev]" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via 
`workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu-push-ci + # latest-docker: + # name: "Latest PyTorch + TensorFlow [dev]" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! 
+ # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu-push-ci + + # latest-torch-deepspeed-docker: + # name: "Latest PyTorch + DeepSpeed" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + + # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + # latest-torch-deepspeed-docker-for-push-ci-daily-build: + # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: 
+ # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + + # doc-builder: + # name: "Doc builder" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-doc-builder + # push: true + # tags: huggingface/transformers-doc-builder + + # latest-pytorch: + # name: "Latest PyTorch [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to 
DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-gpu + + # latest-pytorch-amd: + # name: "Latest PyTorch (AMD) [dev]" + # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + # steps: + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Check out code + # uses: actions/checkout@v3 + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! 
+ # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - latest-torch-deepspeed-docker: - name: "Latest PyTorch + DeepSpeed" - runs-on: ubuntu-22.04 + # latest-tensorflow: + # name: "Latest TensorFlow [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-tensorflow-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-tensorflow-gpu + + latest-pytorch-deepspeed-amd: + name: "PyTorch + DeepSpeed (AMD)" + + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - - name: Check out code + - name: Check out code uses: actions/checkout@v3 - - - name: Login to DockerHub + - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push + - name: Build and push uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu + context: 
./docker/transformers-pytorch-deepspeed-amd-gpu build-args: | REF=main push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - - # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - latest-torch-deepspeed-docker-for-push-ci-daily-build: - name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}-test # Push CI images still need to be re-built daily - name: Build and push (for Push CI) in a daily basis @@ -138,136 +301,8 @@ jobs: if: inputs.image_postfix != '-push-ci' uses: docker/build-push-action@v5 with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - - doc-builder: - name: "Doc builder" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: 
./docker/transformers-doc-builder - push: true - tags: huggingface/transformers-doc-builder - - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-gpu - -# Need to be fixed with the help from Guillaume. -# latest-pytorch-amd: -# name: "Latest PyTorch (AMD) [dev]" -# runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] -# steps: -# - name: Set up Docker Buildx -# uses: docker/setup-buildx-action@v3 -# - name: Check out code -# uses: actions/checkout@v3 -# - name: Login to DockerHub -# uses: docker/login-action@v3 -# with: -# username: ${{ secrets.DOCKERHUB_USERNAME }} -# password: ${{ secrets.DOCKERHUB_PASSWORD }} -# - name: Build and push -# uses: docker/build-push-action@v5 -# with: -# context: ./docker/transformers-pytorch-amd-gpu -# build-args: | -# REF=main -# push: true -# tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} -# # Push CI images still need to be re-built daily -# - -# name: Build and push (for Push CI) in a daily basis -# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. 
-# # The later case is useful for manual image building for debugging purpose. Use another tag in this case! -# if: inputs.image_postfix != '-push-ci' -# uses: docker/build-push-action@v5 -# with: -# context: ./docker/transformers-pytorch-amd-gpu -# build-args: | -# REF=main -# push: true -# tags: huggingface/transformers-pytorch-amd-gpu-push-ci - - latest-tensorflow: - name: "Latest TensorFlow [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-tensorflow-gpu + context: ./docker/transformers-pytorch-deepspeed-amd-gpu build-args: | REF=main push: true - tags: huggingface/transformers-tensorflow-gpu + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci-test \ No newline at end of file diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 0bf9c092c87fd5..2cc44f553f4944 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -107,254 +107,254 @@ jobs: run: | python3 utils/print_env.py - run_tests_single_gpu: - name: Single GPU tests - strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. 
- fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_tests_multi_gpu: - name: Multi GPU tests - strategy: - max-parallel: 1 - fail-fast: false - matrix: - folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). 
- run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - run_examples_gpu: - name: Examples tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v 
/mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - run_pipelines_torch_gpu: - name: PyTorch pipelines tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ 
github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + # run_tests_single_gpu: + # name: Single GPU tests + # strategy: + # max-parallel: 1 # For now, not to parallelize. Can change later if it works well. 
+ # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). + # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
+ + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_tests_multi_gpu: + # name: Multi GPU tests + # strategy: + # max-parallel: 1 + # fail-fast: false + # matrix: + # folders: ${{ fromJson(needs.setup.outputs.matrix) }} + # machine_type: [multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Echo folder ${{ matrix.folders }} + # shell: bash + # # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # # set the artifact folder names (because the character `/` is not allowed). 
+ # run: | + # echo "${{ matrix.folders }}" + # matrix_folders=${{ matrix.folders }} + # matrix_folders=${matrix_folders/'models/'/'models_'} + # echo "$matrix_folders" + # echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all tests on GPU + # working-directory: /transformers + # run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + # path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + + # run_examples_gpu: + # name: Examples tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: 
--device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run examples tests on GPU + # working-directory: /transformers + # run: | + # pip install -r examples/pytorch/_tests_requirements.txt + # python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_examples_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + # run_pipelines_torch_gpu: + # name: PyTorch pipelines tests + # strategy: + # fail-fast: false + # matrix: + # machine_type: [single-gpu, multi-gpu] + # runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + # container: + # image: huggingface/transformers-pytorch-amd-gpu + # options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES 
--shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + # needs: setup + # steps: + # - name: Update clone + # working-directory: /transformers + # run: git fetch && git checkout ${{ github.sha }} + + # - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + # working-directory: /transformers + # run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + # - name: ROCM-SMI + # run: | + # rocm-smi + # - name: ROCM-INFO + # run: | + # rocminfo | grep "Agent" -A 14 + # - name: Show ROCR environment + # run: | + # echo "ROCR: $ROCR_VISIBLE_DEVICES" + + # - name: Environment + # working-directory: /transformers + # run: | + # python3 utils/print_env.py + + # - name: Show installed libraries and their versions + # working-directory: /transformers + # run: pip freeze + + # - name: Run all pipeline tests on GPU + # working-directory: /transformers + # run: | + # python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + # - name: Failure short reports + # if: ${{ failure() }} + # continue-on-error: true + # run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + # - name: Test suite reports artifacts + # if: ${{ always() }} + # uses: actions/upload-artifact@v3 + # with: + # name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + # path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_all_tests_torch_rocm_deepspeed_gpu: name: Torch ROCm deepspeed tests diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile new file mode 100644 index 00000000000000..a5752ebbbeef72 --- /dev/null +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -0,0 +1,19 @@ +FROM rocm/deepspeed:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1_DeepSpeed +LABEL maintainer="Hugging Face" + 
+ARG DEBIAN_FRONTEND=noninteractive
+# PyTorch (2.0.1) and ROCm (5.7) are fixed by the base image tag, so no PYTORCH/ROCM build args are declared.
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# REF is the transformers git ref to check out; it is quoted below so the shell cannot split or glob it.
+ARG REF=main
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout "$REF"
+RUN python3 -m pip install --no-cache-dir "./transformers[deepspeed-testing]"
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# This line must be added in order for Python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
+
+# Build-time smoke test: the image build fails if the DeepSpeed launcher cannot be imported.
+RUN python3 -c "from deepspeed.launcher.runner import main"