diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml
index 75ea3bb24bc7fa..5634ae1d8fe44d 100644
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@@ -7,7 +7,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_scheduled_ci*
+      - drop_py38_build_img
 
 jobs:
   model-ci:
@@ -17,62 +17,6 @@ jobs:
       job: run_models_gpu
       slack_report_channel: "#transformers-ci-daily-models"
       runner: daily-ci
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      runner: daily-ci
-      docker: huggingface/transformers-pytorch-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  tf-pipeline:
-    name: TF pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_tf_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
-      runner: daily-ci
-      docker: huggingface/transformers-tensorflow-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-examples"
-      runner: daily-ci
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-deepspeed"
-      runner: daily-ci
-      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      ci_event: Daily CI
-      working-directory-prefix: /workspace
-    secrets: inherit
-
-  quantization-ci:
-    name: Quantization CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_quantization_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-quantization"
-      runner: daily-ci
-      docker: huggingface/transformers-quantization-latest-gpu
+      docker: huggingface/transformers-all-latest-gpu-test
       ci_event: Daily CI
     secrets: inherit
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 353fb59843e4a5..654ecaef693cc4 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -50,11 +50,11 @@ jobs:
     name: Setup
     strategy:
       matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
     runs-on:
       group: '${{ matrix.machine_type }}'
     container:
-      image: huggingface/transformers-all-latest-gpu
+      image: huggingface/transformers-all-latest-gpu-test
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
@@ -103,7 +103,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
         slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
     uses: ./.github/workflows/model_jobs.yml
     with:
@@ -113,462 +113,3 @@ jobs:
       runner: ${{ inputs.runner }}
       docker: ${{ inputs.docker }}
     secrets: inherit
-
-  run_pipelines_torch_gpu:
-    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
-    name: PyTorch pipelines
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-pytorch-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
-
-  run_pipelines_tf_gpu:
-    if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
-    name: TensorFlow pipelines
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-tensorflow-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: |
-          git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: |
-          cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
-
-  run_examples_gpu:
-    if: ${{ inputs.job == 'run_examples_gpu' }}
-    name: Examples directory
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run examples tests on GPU
-        working-directory: /transformers
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports
-
-  run_torch_cuda_extensions_gpu:
-    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
-    name: Torch CUDA extension tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: ${{ inputs.docker }}
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: Update / Install some packages (for Past CI)
-        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: |
-          python3 -m pip install -U datasets
-          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
-      - name: Remove cached torch extensions
-        run: rm -rf /github/home/.cache/torch_extensions/
-
-      # To avoid unknown test failures
-      - name: Pre build DeepSpeed *again* (for daily CI)
-        if: ${{ contains(inputs.ci_event, 'Daily CI') }}
-        working-directory: ${{ inputs.working-directory-prefix }}/
-        run: |
-          python3 -m pip uninstall -y deepspeed
-          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
-
-      # To avoid unknown test failures
-      - name: Pre build DeepSpeed *again* (for nightly & Past CI)
-        if: ${{ contains(inputs.ci_event, 'Nightly CI') || contains(inputs.ci_event, 'Past CI') }}
-        working-directory: ${{ inputs.working-directory-prefix }}/
-        run: |
-          python3 -m pip uninstall -y deepspeed
-          rm -rf DeepSpeed
-          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all tests on GPU
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
-          path: ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
-
-  run_quantization_torch_gpu:
-    if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-    name: " "
-    needs: setup
-    strategy:
-      max-parallel: 4
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-quantization-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run quantization tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
-
-  run_extract_warnings:
-    # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
-    if: ${{ always() && inputs.job == 'run_models_gpu' }}
-    name: Extract warnings in CI artifacts
-    runs-on: ubuntu-22.04
-    needs: [setup, run_models_gpu]
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 2
-
-      - name: Install transformers
-        run: pip install transformers
-
-      - name: Show installed libraries and their versions
-        run: pip freeze
-
-      - name: Create output directory
-        run: mkdir warnings_in_ci
-
-      - uses: actions/download-artifact@v4
-        with:
-          path: warnings_in_ci
-
-      - name: Show artifacts
-        run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
-        working-directory: warnings_in_ci
-
-      - name: Extract warnings in CI artifacts
-        run: |
-          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
-          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
-
-      - name: Upload artifact
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: warnings_in_ci
-          path: warnings_in_ci/selected_warnings.json
-
-  send_results:
-    name: Slack Report
-    needs: [
-      setup,
-      run_models_gpu,
-      run_pipelines_torch_gpu,
-      run_pipelines_tf_gpu,
-      run_examples_gpu,
-      run_torch_cuda_extensions_gpu,
-      run_quantization_torch_gpu,
-      run_extract_warnings
-    ]
-    if: ${{ always() }}
-    uses: ./.github/workflows/slack-report.yml
-    with:
-      job: ${{ inputs.job }}
-      # This would be `skipped` if `setup` is skipped.
-      setup_status: ${{ needs.setup.result }}
-      slack_report_channel: ${{ inputs.slack_report_channel }}
-      # This would be an empty string if `setup` is skipped.
-      folder_slices: ${{ needs.setup.outputs.folder_slices }}
-      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
-      ci_event: ${{ inputs.ci_event }}
-
-    secrets: inherit
-
-  check_new_model_failures:
-    if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }}
-    name: Check new model failures
-    needs: send_results
-    uses: ./.github/workflows/check_failed_model_tests.yml
-    with:
-      docker: ${{ inputs.docker }}
-      start_sha: ${{ github.sha }}
-    secrets: inherit
\ No newline at end of file
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 08e37ea6e1292f..f3de6b7a26f5ef 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
index 2c1f153eef275e..62578ad0f3610f 100644
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile
index 0617ac8cdd779c..d1cb8e7b7a4609 100755
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile
index adccee1ace4998..d765767780f46c 100644
--- a/docker/transformers-tensorflow-gpu/Dockerfile
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/utils/split_model_tests.py b/utils/split_model_tests.py
index e5083aaeb46fa5..958b872c4a50cd 100644
--- a/utils/split_model_tests.py
+++ b/utils/split_model_tests.py
@@ -62,4 +62,5 @@
         start = end
         end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
         model_splits.append(d[start:end])
+    model_splits = [["models/vit"]]
     print(model_splits)
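
Note for reviewers: the `utils/split_model_tests.py` hunk above appends an override after the split loop, so the scheduled run exercises only `models/vit`. Below is a minimal standalone sketch of that slicing logic and of what the override does; the folder list and `num_splits` value are made up for illustration and are not the real output of the script's `tests/models` directory scan.

```python
# Sketch of the slicing loop in utils/split_model_tests.py (hypothetical inputs).
num_splits = 3  # stand-in for args.num_splits
d = ["models/bert", "models/gpt2", "models/t5", "models/vit", "pipelines"]  # stand-in test folders

num_jobs = len(d)
num_jobs_per_splits = num_jobs // num_splits

model_splits = []
end = 0
for idx in range(num_splits):
    start = end
    # The first (num_jobs % num_splits) slices each absorb one extra folder,
    # so slice sizes differ by at most one.
    end = start + num_jobs_per_splits + (1 if idx < num_jobs % num_splits else 0)
    model_splits.append(d[start:end])

print(model_splits)
# [['models/bert', 'models/gpt2'], ['models/t5', 'models/vit'], ['pipelines']]

# The line added in this diff then discards the computed slices entirely,
# pinning the CI matrix to a single one-folder slice:
model_splits = [["models/vit"]]
print(model_splits)  # [['models/vit']]
```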