From 4f9a39a862b9f5d91c54e531ecec38681a0f6818 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Tue, 2 Apr 2024 15:55:55 +0200 Subject: [PATCH] Add torch/tf pipeline --- .github/workflows/self-scheduled-caller.yml | 18 ++ .github/workflows/self-scheduled.yml | 206 ++++++++++---------- utils/notification_service.py | 6 +- 3 files changed, 125 insertions(+), 105 deletions(-) diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 90e2b889dabe3b..2eabb622752c2b 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -25,6 +25,24 @@ jobs: env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS secrets: inherit + torch-pipeline: + name: Torch pipeline CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_pipelines_torch_gpu + # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`. + env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS + secrets: inherit + + tf-pipeline: + name: TF pipeline CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_pipelines_tf_gpu + # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`. + env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS + secrets: inherit + example-ci: name: Example CI uses: ./.github/workflows/self-scheduled.yml diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 71dde7d67a1684..1fd80d44244282 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -89,6 +89,109 @@ jobs: slice_id: ${{ matrix.slice_id }} secrets: inherit + run_pipelines_torch_gpu: + if: ${{ inputs.job == 'run_pipelines_torch_gpu' }} + name: PyTorch pipelines + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-pytorch-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + + run_pipelines_tf_gpu: + if: ${{ inputs.job == 'run_pipelines_tf_gpu' }} + name: TensorFlow pipelines + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-tensorflow-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: | + git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ always() }} + run: | + cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu + run_examples_gpu: if: ${{ inputs.job == 'run_examples_gpu' }} name: Examples directory @@ -140,107 +243,6 @@ jobs: name: ${{ matrix.machine_type }}_run_examples_gpu path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu -# run_pipelines_torch_gpu: -# name: PyTorch pipelines -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu, multi-gpu] -# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] -# container: -# image: huggingface/transformers-pytorch-gpu -# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# needs: setup -# steps: -# - name: Update clone -# working-directory: /transformers -# run: git fetch && git checkout ${{ github.sha }} -# -# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) -# working-directory: /transformers -# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . -# -# - name: NVIDIA-SMI -# run: | -# nvidia-smi -# -# - name: Environment -# working-directory: /transformers -# run: | -# python3 utils/print_env.py -# -# - name: Show installed libraries and their versions -# working-directory: /transformers -# run: pip freeze -# -# - name: Run all pipeline tests on GPU -# working-directory: /transformers -# run: | -# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines -# -# - name: Failure short reports -# if: ${{ failure() }} -# continue-on-error: true -# run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt -# -# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" -# if: ${{ always() }} -# uses: actions/upload-artifact@v3 -# with: -# name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu -# path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu -# -# run_pipelines_tf_gpu: -# name: TensorFlow pipelines -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu, multi-gpu] -# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] -# container: -# image: huggingface/transformers-tensorflow-gpu -# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# needs: setup -# steps: -# - name: Update clone -# working-directory: /transformers -# run: | -# git fetch && git checkout ${{ github.sha }} -# -# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) -# working-directory: /transformers -# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . -# -# - name: NVIDIA-SMI -# run: | -# nvidia-smi -# -# - name: Environment -# working-directory: /transformers -# run: | -# python3 utils/print_env.py -# -# - name: Show installed libraries and their versions -# working-directory: /transformers -# run: pip freeze -# -# - name: Run all pipeline tests on GPU -# working-directory: /transformers -# run: | -# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines -# -# - name: Failure short reports -# if: ${{ always() }} -# run: | -# cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt -# -# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" -# if: ${{ always() }} -# uses: actions/upload-artifact@v3 -# with: -# name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu -# path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu -# run_all_tests_torch_cuda_extensions_gpu: if: ${{ inputs.job == 'run_all_tests_torch_cuda_extensions_gpu' }} name: Torch CUDA extension tests @@ -394,7 +396,7 @@ jobs: send_results: name: Slack Report - needs: [setup, run_tests_gpu, run_examples_gpu, run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, run_extract_warnings] + needs: [setup, run_tests_gpu, run_pipelines_torch_gpu, run_pipelines_tf_gpu, run_examples_gpu, run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, run_extract_warnings] if: ${{ always() }} uses: ./.github/workflows/slack-report.yml with: diff --git a/utils/notification_service.py b/utils/notification_service.py index e20d8c632fc5ea..b6b200d9562251 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -1058,9 +1058,9 @@ def prepare_reports(title, header, reports, to_truncate=True): # Additional runs additional_files = { - "Examples directory": "run_examples_gpu", "PyTorch pipelines": "run_tests_torch_pipeline_gpu", "TensorFlow pipelines": "run_tests_tf_pipeline_gpu", + "Examples directory": "run_examples_gpu", "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", "Quantization tests": "run_tests_quantization_torch_gpu", } @@ -1079,9 +1079,9 @@ def prepare_reports(title, header, reports, to_truncate=True): # `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a # specific job. See below. job_to_test_map = { + "run_pipelines_torch_gpu": "PyTorch pipelines", + "run_pipelines_tf_gpu": "TensorFlow pipelines", "run_examples_gpu": "Examples directory", - # "": "PyTorch pipelines", - # "": "TensorFlow pipelines", "run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests", "run_tests_quantization_torch_gpu": "Quantization tests", }