Skip to content

Commit

Permalink
try new cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
ydshieh committed Aug 28, 2024
1 parent abb454a commit 924ead2
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 363 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/model_jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
runs-on:
group: '${{ inputs.machine_type }}'
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down
366 changes: 4 additions & 362 deletions .github/workflows/self-scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ jobs:
name: Setup
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down Expand Up @@ -102,7 +103,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:
Expand All @@ -112,362 +113,3 @@ jobs:
runner: ${{ inputs.runner }}
docker: ${{ inputs.docker }}
secrets: inherit

run_pipelines_torch_gpu:
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
name: PyTorch pipelines
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
container:
image: huggingface/transformers-pytorch-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports

run_pipelines_tf_gpu:
if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
name: TensorFlow pipelines
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ always() }}
run: |
cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_tf_gpu_test_reports

run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run examples tests on GPU
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports

run_torch_cuda_extensions_gpu:
if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
name: Torch CUDA extension tests
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: Update / Install some packages (for Past CI)
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: |
python3 -m pip install -U datasets
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
- name: Remove cached torch extensions
run: rm -rf /github/home/.cache/torch_extensions/

# To avoid unknown test failures
- name: Pre build DeepSpeed *again* (for daily CI)
if: ${{ contains(inputs.ci_event, 'Daily CI') }}
working-directory: ${{ inputs.working-directory-prefix }}/
run: |
python3 -m pip uninstall -y deepspeed
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
# To avoid unknown test failures
- name: Pre build DeepSpeed *again* (for nightly & Past CI)
if: ${{ contains(inputs.ci_event, 'Nightly CI') || contains(inputs.ci_event, 'Past CI') }}
working-directory: ${{ inputs.working-directory-prefix }}/
run: |
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: pip freeze

- name: Run all tests on GPU
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: |
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
path: ${{ inputs.working-directory-prefix }}/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

run_quantization_torch_gpu:
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
name: " "
needs: setup
strategy:
max-parallel: 4
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, '${{ inputs.runner }}']
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run quantization tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports

run_extract_warnings:
# Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
if: ${{ always() && inputs.job == 'run_models_gpu' }}
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
needs: [setup, run_models_gpu]
steps:
- name: Checkout transformers
uses: actions/checkout@v4
with:
fetch-depth: 2

- name: Install transformers
run: pip install transformers

- name: Show installed libraries and their versions
run: pip freeze

- name: Create output directory
run: mkdir warnings_in_ci

- uses: actions/download-artifact@v4
with:
path: warnings_in_ci

- name: Show artifacts
run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
working-directory: warnings_in_ci

- name: Extract warnings in CI artifacts
run: |
python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
- name: Upload artifact
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: warnings_in_ci
path: warnings_in_ci/selected_warnings.json

send_results:
name: Slack Report
needs: [
setup,
run_models_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
run_torch_cuda_extensions_gpu,
run_quantization_torch_gpu,
run_extract_warnings
]
if: ${{ always() }}
uses: ./.github/workflows/slack-report.yml
with:
job: ${{ inputs.job }}
# This would be `skipped` if `setup` is skipped.
setup_status: ${{ needs.setup.result }}
slack_report_channel: ${{ inputs.slack_report_channel }}
# This would be an empty string if `setup` is skipped.
folder_slices: ${{ needs.setup.outputs.folder_slices }}
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
ci_event: ${{ inputs.ci_event }}

secrets: inherit

0 comments on commit 924ead2

Please sign in to comment.