From f012b3f67b1e48851bb3b5e507614b6505338fcb Mon Sep 17 00:00:00 2001 From: ydshieh Date: Thu, 28 Mar 2024 18:55:44 +0100 Subject: [PATCH] rebase --- .github/workflows/self-scheduled-caller.yml | 35 ++ .github/workflows/self-scheduled.yml | 645 ++++++++++---------- .github/workflows/slack-report.yml | 61 ++ utils/notification_service.py | 50 +- utils/split_model_tests.py | 1 + 5 files changed, 464 insertions(+), 328 deletions(-) create mode 100644 .github/workflows/self-scheduled-caller.yml create mode 100644 .github/workflows/slack-report.yml diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml new file mode 100644 index 00000000000000..5210f96812aa32 --- /dev/null +++ b/.github/workflows/self-scheduled-caller.yml @@ -0,0 +1,35 @@ +name: Self-hosted runner (scheduled) + +# Note that each job's dependencies go into a corresponding docker file. +# +# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is +# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at +# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` + +on: + repository_dispatch: + schedule: + - cron: "17 2 * * *" + push: + branches: + - run_scheduled_ci* + - move_jobs_from_daily_ci + +jobs: + model-ci: + name: Model CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_tests_gpu + env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS + secrets: inherit + + + + quantization-ci: + name: Quantization CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_tests_quantization_torch_gpu + env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS + secrets: inherit diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 465c00dd13bbcd..456695a908786c 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -7,12 +7,14 @@ name: Self-hosted runner (scheduled) # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` on: - repository_dispatch: - schedule: - - cron: "17 2 * * *" - push: - branches: - - run_scheduled_ci* + workflow_call: + inputs: + job: + required: true + type: string + env_name_for_slack_report_channel: + required: true + type: string env: HF_HOME: /mnt/cache @@ -31,6 +33,7 @@ env: jobs: setup: + if: ${{ inputs.job == 'run_tests_gpu' }} name: Setup strategy: matrix: @@ -71,6 +74,7 @@ jobs: nvidia-smi run_tests_gpu: + if: ${{ inputs.job == 'run_tests_gpu' }} name: " " needs: setup strategy: @@ -85,219 +89,220 @@ jobs: slice_id: ${{ matrix.slice_id }} secrets: inherit - run_examples_gpu: - name: Examples directory - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-all-latest-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - run_pipelines_torch_gpu: - name: PyTorch pipelines - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-pytorch-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu - - run_pipelines_tf_gpu: - name: TensorFlow pipelines - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-tensorflow-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: | - git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ always() }} - run: | - cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu - - run_all_tests_torch_cuda_extensions_gpu: - name: Torch CUDA extension tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - needs: setup - container: - image: huggingface/transformers-pytorch-deepspeed-latest-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /workspace/transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /workspace/transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Remove cached torch extensions - run: rm -rf /github/home/.cache/torch_extensions/ - - # To avoid unknown test failures - - name: Pre build DeepSpeed *again* - working-directory: /workspace - run: | - python3 -m pip uninstall -y deepspeed - DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /workspace/transformers - run: | - python utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /workspace/transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /workspace/transformers - run: | - python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu +# run_examples_gpu: +# name: Examples directory +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# container: +# image: huggingface/transformers-all-latest-gpu +# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# needs: setup +# steps: +# - name: Update clone +# working-directory: /transformers +# run: git fetch && git checkout ${{ github.sha }} +# +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . +# +# - name: NVIDIA-SMI +# run: | +# nvidia-smi +# +# - name: Environment +# working-directory: /transformers +# run: | +# python3 utils/print_env.py +# +# - name: Show installed libraries and their versions +# working-directory: /transformers +# run: pip freeze +# +# - name: Run examples tests on GPU +# working-directory: /transformers +# run: | +# pip install -r examples/pytorch/_tests_requirements.txt +# python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch +# +# - name: Failure short reports +# if: ${{ failure() }} +# continue-on-error: true +# run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt +# +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_examples_gpu +# path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu +# +# run_pipelines_torch_gpu: +# name: PyTorch pipelines +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# container: +# image: huggingface/transformers-pytorch-gpu +# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# needs: setup +# steps: +# - name: Update clone +# working-directory: /transformers +# run: git fetch && git checkout ${{ github.sha }} +# +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . +# +# - name: NVIDIA-SMI +# run: | +# nvidia-smi +# +# - name: Environment +# working-directory: /transformers +# run: | +# python3 utils/print_env.py +# +# - name: Show installed libraries and their versions +# working-directory: /transformers +# run: pip freeze +# +# - name: Run all pipeline tests on GPU +# working-directory: /transformers +# run: | +# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines +# +# - name: Failure short reports +# if: ${{ failure() }} +# continue-on-error: true +# run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt +# +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu +# path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu +# +# run_pipelines_tf_gpu: +# name: TensorFlow pipelines +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# container: +# image: huggingface/transformers-tensorflow-gpu +# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# needs: setup +# steps: +# - name: Update clone +# working-directory: /transformers +# run: | +# git fetch && git checkout ${{ github.sha }} +# +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . +# +# - name: NVIDIA-SMI +# run: | +# nvidia-smi +# +# - name: Environment +# working-directory: /transformers +# run: | +# python3 utils/print_env.py +# +# - name: Show installed libraries and their versions +# working-directory: /transformers +# run: pip freeze +# +# - name: Run all pipeline tests on GPU +# working-directory: /transformers +# run: | +# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines +# +# - name: Failure short reports +# if: ${{ always() }} +# run: | +# cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt +# +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu +# path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu +# +# run_all_tests_torch_cuda_extensions_gpu: +# name: Torch CUDA extension tests +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# needs: setup +# container: +# image: huggingface/transformers-pytorch-deepspeed-latest-gpu +# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# steps: +# - name: Update clone +# working-directory: /workspace/transformers +# run: git fetch && git checkout ${{ github.sha }} +# +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /workspace/transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . +# +# - name: Remove cached torch extensions +# run: rm -rf /github/home/.cache/torch_extensions/ +# +# # To avoid unknown test failures +# - name: Pre build DeepSpeed *again* +# working-directory: /workspace +# run: | +# python3 -m pip uninstall -y deepspeed +# DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check +# +# - name: NVIDIA-SMI +# run: | +# nvidia-smi +# +# - name: Environment +# working-directory: /workspace/transformers +# run: | +# python utils/print_env.py +# +# - name: Show installed libraries and their versions +# working-directory: /workspace/transformers +# run: pip freeze +# +# - name: Run all tests on GPU +# working-directory: /workspace/transformers +# run: | +# python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended +# +# - name: Failure short reports +# if: ${{ failure() }} +# continue-on-error: true +# run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt +# +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports +# path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu run_tests_quantization_torch_gpu: + if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} name: Quantization tests strategy: fail-fast: false @@ -307,7 +312,6 @@ jobs: container: image: huggingface/transformers-quantization-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup steps: - name: Update clone working-directory: /transformers @@ -347,101 +351,114 @@ jobs: name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu - run_extract_warnings: - name: Extract warnings in CI artifacts - runs-on: ubuntu-22.04 - if: always() - needs: [ - setup, - run_tests_gpu, - run_examples_gpu, - run_pipelines_tf_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_cuda_extensions_gpu, - run_tests_quantization_torch_gpu, - ] - steps: - - name: Checkout transformers - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - - name: Install transformers - run: pip install transformers - - - name: Show installed libraries and their versions - run: pip freeze - - - name: Create output directory - run: mkdir warnings_in_ci - - - uses: actions/download-artifact@v3 - with: - path: warnings_in_ci - - - name: Show artifacts - run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')" - working-directory: warnings_in_ci - - - name: Extract warnings in CI artifacts - run: | - python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh - echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')" - - - name: Upload artifact - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: warnings_in_ci - path: warnings_in_ci/selected_warnings.json +# +# run_extract_warnings: +# name: Extract warnings in CI artifacts +# runs-on: ubuntu-22.04 +# if: always() +# needs: [ +# setup, +# run_tests_gpu, +# run_examples_gpu, +# run_pipelines_tf_gpu, +# run_pipelines_torch_gpu, +# run_all_tests_torch_cuda_extensions_gpu, +# run_tests_quantization_torch_gpu, +# ] +# steps: +# - name: Checkout transformers +# uses: actions/checkout@v3 +# with: +# fetch-depth: 2 +# +# - name: Install transformers +# run: pip install transformers +# +# - name: Show installed libraries and their versions +# run: pip freeze +# +# - name: Create output directory +# run: mkdir warnings_in_ci +# +# - uses: actions/download-artifact@v3 +# with: +# path: warnings_in_ci +# +# - name: Show artifacts +# run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')" +# working-directory: warnings_in_ci +# +# - name: Extract warnings in CI artifacts +# run: | +# python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh +# echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')" +# +# - name: Upload artifact +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: warnings_in_ci +# path: warnings_in_ci/selected_warnings.json +# +# send_results: +# name: Send results to webhook +# runs-on: ubuntu-22.04 +# if: always() +# needs: [ +# setup, +# run_tests_gpu, +# run_examples_gpu, +# run_pipelines_tf_gpu, +# run_pipelines_torch_gpu, +# run_all_tests_torch_cuda_extensions_gpu, +# run_tests_quantization_torch_gpu, +# run_extract_warnings +# ] +# steps: +# - name: Preliminary job status +# shell: bash +# # For the meaning of these environment variables, see the job `Setup` +# run: | +# echo "Setup status: ${{ needs.setup.result }}" +# +# - uses: actions/checkout@v3 +# - uses: actions/download-artifact@v3 +# - name: Send message to Slack +# env: +# CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} +# CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} +# CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} +# CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} +# CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} +# ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} +# CI_EVENT: scheduled +# CI_SHA: ${{ github.sha }} +# CI_WORKFLOW_REF: ${{ github.workflow_ref }} +# SETUP_STATUS: ${{ needs.setup.result }} +# # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change +# # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. +# run: | +# sudo apt-get install -y curl +# pip install slack_sdk +# pip show slack_sdk +# python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}" +# +# # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. +# - name: Failure table artifacts +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: prev_ci_results +# path: prev_ci_results send_results: - name: Send results to webhook - runs-on: ubuntu-22.04 - if: always() - needs: [ - setup, - run_tests_gpu, - run_examples_gpu, - run_pipelines_tf_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_cuda_extensions_gpu, - run_tests_quantization_torch_gpu, - run_extract_warnings - ] - steps: - - name: Preliminary job status - shell: bash - # For the meaning of these environment variables, see the job `Setup` - run: | - echo "Setup status: ${{ needs.setup.result }}" - - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 - - name: Send message to Slack - env: - CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} - CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} - CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} - CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: scheduled - CI_SHA: ${{ github.sha }} - CI_WORKFLOW_REF: ${{ github.workflow_ref }} - SETUP_STATUS: ${{ needs.setup.result }} - # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change - # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. - run: | - sudo apt-get install -y curl - pip install slack_sdk - pip show slack_sdk - python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}" - - # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - - name: Failure table artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: prev_ci_results - path: prev_ci_results + name: Slack Report + needs: [setup, run_tests_gpu, run_tests_quantization_torch_gpu] + if: ${{ always() }} + uses: ./.github/workflows/slack-report.yml + with: + job: ${{ inputs.job }} + setup_status: ${{ needs.setup.result }} + env_name_for_slack_report_channel: ${{ inputs.env_name_for_slack_report_channel }} + folder_slices: ${{ needs.setup.outputs.folder_slices }} + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml new file mode 100644 index 00000000000000..0ca567818f8f15 --- /dev/null +++ b/.github/workflows/slack-report.yml @@ -0,0 +1,61 @@ +name: CI slack report + +on: + workflow_call: + inputs: + job: + required: true + type: string + env_name_for_slack_report_channel: + required: true + type: string + setup_status: + required: true + type: string + folder_slices: + required: true + type: string + + +jobs: + send_results: + name: Send results to webhook + runs-on: ubuntu-22.04 + if: always() + steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Setup status: ${{ inputs.setup_status }}" + + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID: ${{ inputs.env_name_for_slack_report_channel }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: scheduled + CI_SHA: ${{ github.sha }} + CI_WORKFLOW_REF: ${{ github.workflow_ref }} + CI_TEST_JOB: ${{ inputs.job }} + SETUP_STATUS: ${{ inputs.setup_status }} + # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. + run: | + sudo apt-get install -y curl + pip install slack_sdk + pip show slack_sdk + python utils/notification_service.py "${{ inputs.folder_slices }}" + +# # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. +# - name: Failure table artifacts +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: prev_ci_results +# path: prev_ci_results diff --git a/utils/notification_service.py b/utils/notification_service.py index d29e6994a232b2..10a8fd74b04af4 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -573,7 +573,7 @@ def error_out(title, ci_title="", runner_not_available=False, runner_failed=Fals print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=CI_SLACK_REPORT_CHANNEL_ID, text=text, blocks=payload, ) @@ -586,7 +586,7 @@ def post(self): text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed." self.thread_ts = client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=CI_SLACK_REPORT_CHANNEL_ID, blocks=payload, text=text, ) @@ -712,7 +712,7 @@ def post_reply(self): print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=CI_SLACK_REPORT_CHANNEL_ID, text=f"Results for {job}", blocks=blocks, thread_ts=self.thread_ts["ts"], @@ -735,7 +735,7 @@ def post_reply(self): print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=CI_SLACK_REPORT_CHANNEL_ID, text=f"Results for {job}", blocks=blocks, thread_ts=self.thread_ts["ts"], @@ -749,7 +749,7 @@ def post_reply(self): print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=CI_SLACK_REPORT_CHANNEL_ID, text="Results for new failures", blocks=blocks, thread_ts=self.thread_ts["ts"], @@ -852,6 +852,10 @@ def prepare_reports(title, header, reports, to_truncate=True): if __name__ == "__main__": + + ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID = os.environ["ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID"] + CI_SLACK_REPORT_CHANNEL_ID = os.environ[ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID] + # runner_status = os.environ.get("RUNNER_STATUS") # runner_env_status = os.environ.get("RUNNER_ENV_STATUS") setup_status = os.environ.get("SETUP_STATUS") @@ -861,7 +865,7 @@ def prepare_reports(title, header, reports, to_truncate=True): # Let's keep the lines regardig runners' status (we might be able to use them again in the future) runner_not_available = False runner_failed = False - setup_failed = True if setup_status is not None and setup_status != "success" else False + setup_failed = False if setup_status in ["skipped", "success"] else True org = "huggingface" repo = "transformers" @@ -929,14 +933,18 @@ def prepare_reports(title, header, reports, to_truncate=True): Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed) exit(0) - arguments = sys.argv[1:][0] - try: - folder_slices = ast.literal_eval(arguments) - # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names). - models = [x.replace("models/", "models_") for folders in folder_slices for x in folders] - except SyntaxError: - Message.error_out(title, ci_title) - raise ValueError("Errored out.") + arguments = sys.argv[1:] + if len(arguments) == 0 or arguments[0] == "": + models = [] + else: + model_list_as_str = arguments[0] + try: + folder_slices = ast.literal_eval(model_list_as_str) + # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names). + models = [x.replace("models/", "models_") for folders in folder_slices for x in folders] + except: + Message.error_out(title, ci_title) + raise ValueError("Errored out.") github_actions_jobs = get_jobs( workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"] @@ -1038,6 +1046,13 @@ def prepare_reports(title, header, reports, to_truncate=True): unclassified_model_failures.append(line) # Additional runs + job_to_test_map = { + # "": "Examples directory", + # "": "PyTorch pipelines", + # "": "TensorFlow pipelines", + # "": "Torch CUDA extension tests", + "run_tests_quantization_torch_gpu": "Quantization tests", + } additional_files = { "Examples directory": "run_examples_gpu", "PyTorch pipelines": "run_tests_torch_pipeline_gpu", @@ -1056,6 +1071,13 @@ def prepare_reports(title, header, reports, to_truncate=True): elif ci_event.startswith("Push CI (AMD)"): additional_files = {} + test_name = None + job_name = os.getenv("CI_TEST_JOB") + if job_name in job_to_test_map: + test_name = job_to_test_map[job_name] + + additional_files = {k: v for k, v in additional_files.items() if k == test_name} + additional_results = { key: { "failed": {"unclassified": 0, "single": 0, "multi": 0}, diff --git a/utils/split_model_tests.py b/utils/split_model_tests.py index fc8800ffcf1c48..7b97e2cbc01d3a 100644 --- a/utils/split_model_tests.py +++ b/utils/split_model_tests.py @@ -62,4 +62,5 @@ start = end end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0) model_splits.append(d[start:end]) + model_splits = [["models/bertweet"], ["models/byt5"]] print(model_splits)