Skip to content

Commit

Permalink
Merge branch 'microsoft:master' into torch_compile_micro_offset_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
NirSonnenschein authored Dec 4, 2024
2 parents cc128fa + 0c6c981 commit 808aa45
Show file tree
Hide file tree
Showing 144 changed files with 2,792 additions and 502 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ jobs:
- name: Install (ROCm) apex
run: |
git clone https://github.com/ROCmSoftwarePlatform/apex.git
cd apex
git checkout torch_2.1_higher
CURRENT_VER=$(git rev-parse HEAD)
INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cpu-torch-latest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ concurrency:

jobs:
unit-tests:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
Expand Down
85 changes: 85 additions & 0 deletions .github/workflows/hpu-gaudi2-nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Nightly run of a selected subset of the DeepSpeed unit tests on a
# self-hosted Intel Gaudi2 (HPU) runner, inside the Habana PyTorch container.
name: hpu-gaudi2-nightly

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Once a day at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: "0 0 * * *"
  # Also run when this workflow file itself is modified in a pull request.
  pull_request:
    paths:
      - ".github/workflows/hpu-gaudi2-nightly.yml"

# One active run per workflow/ref pair; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, intel, gaudi2]
    container:
      image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
      ports:
        - 80
      options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

    env:
      # Quoted so the values are unambiguously strings.
      PT_HPU_LAZY_MODE: "0"
      TORCHINDUCTOR_COMPILE_THREADS: "1"
      # One test file name per line; the "Unit tests" step joins these lines
      # with " or " to build the pytest -k selection expression.
      TEST_LIST: |
        test_adamw.py
        test_bf16.py
        test_ds_config_dict.py
        test_dynamic_loss_scale.py
        test_latest_checkpoint.py
        test_moe_checkpoint.py
        test_multi_output_model.py
        test_other_optimizer.py
        test_pipe.py
        test_pipeline.py
        test_universal_checkpoint.py
        test_zero_context_return.py
        test_zero_leaf_module.py
        test_zero_offloadpp.py
        test_zero_tiled.py

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      # Print toolchain and device information to make failures easier to diagnose.
      - name: Check container state
        run: |
          ldd --version
          hl-smi -L
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Install transformers from the tip of its default branch and log the
      # commit that was used.
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          pip install .

      # The extras spec is quoted so the shell never treats [...] as a glob pattern.
      - name: Install deepspeed
        run: |
          pip install ".[dev,autotuning]"
          ds_report

      - name: Python environment
        run: |
          pip list

      # The awk program emits every non-blank line of TEST_LIST, prefixing each
      # one after the first input line with " or ", which yields a single
      # "a.py or b.py or ..." expression for pytest -k.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"
2 changes: 1 addition & 1 deletion .github/workflows/hpu-gaudi2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/no-torch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ permissions:

jobs:
unit-tests:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
Expand All @@ -30,6 +30,7 @@ jobs:
- name: Python environment
run: |
pip uninstall torch --yes
pip install setuptools
pip list
- name: Build deepspeed
Expand Down
8 changes: 3 additions & 5 deletions .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
Expand All @@ -47,8 +47,6 @@ jobs:
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
# Update packages included in the container that do not support pydantic 2+ to versions that do
python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
Expand All @@ -58,8 +56,8 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12"
- name: MII unit tests
run: |
BRANCH="main"
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/nv-ds-chat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ on:
type: string
pull_request:
paths:
- ".github/workflows/nv-ds-chat.yml"
- "deepspeed/runtime/zero/stage_1_and_2.py"
- "deepspeed/runtime/zero/stage3.py"
- "deepspeed/runtime/hybrid_engine.py"
Expand Down Expand Up @@ -42,6 +43,7 @@ jobs:
- name: Install deepspeed
run: |
pip install transformers==4.45.2
pip install .[dev]
ds_report
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-human-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
Expand Down Expand Up @@ -50,4 +50,4 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12"
2 changes: 1 addition & 1 deletion .github/workflows/nv-lightning-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-pre-compile-ops.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ concurrency:

jobs:
unit-tests:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116

Expand Down
6 changes: 2 additions & 4 deletions .github/workflows/nv-sd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
sd-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
Expand All @@ -53,8 +53,6 @@ jobs:
pip install image-similarity-measures
python -m pip install opencv-python==4.6.* --force-reinstall
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
# Update packages included in the container that do not support pydantic 2+ to versions that do
python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,sd]
ds_report
- name: Python environment
Expand All @@ -64,7 +62,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12"
python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.3" --cuda_ver="12"
- name: Open GitHub issue if weekly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch110-p40.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch110-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ jobs:
unit-tests:
strategy:
matrix:
pyVersion: ["3.7", "3.8", "3.9", "3.10"]
pyVersion: ["3.8", "3.9", "3.10"]
fail-fast: false

runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
container:
image: deepspeed/gh-builder:py${{ matrix.pyVersion }}

Expand Down
13 changes: 6 additions & 7 deletions .github/workflows/xpu-max1100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
Expand All @@ -47,12 +47,11 @@ jobs:
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
pip install torch==2.1.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install intel-extension-for-pytorch==2.1.30+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
pip install intel-extension-for-pytorch-deepspeed==2.1.30 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install oneccl_bind_pt==2.1.300+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
pip install torchvision==0.16.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install py-cpuinfo numpy==1.26
pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
pip install py-cpuinfo numpy
pip install .[dev,autotuning]
- name: Check container state
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
- id: trailing-whitespace

- repo: https://github.com/google/yapf
rev: v0.32.0
rev: v0.40.0
hooks:
- id: yapf

Expand Down Expand Up @@ -65,7 +65,7 @@ repos:
]

- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 5.0.4
hooks:
- id: flake8
args: ['--config=.flake8']
Expand Down
9 changes: 9 additions & 0 deletions COMMITTERS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# DeepSpeed TSC Committers #

| Name | GitHub ID | Affiliation |
|--- | ---- | --- |
| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft |
| Logan Adams | [loadams](https://github.com/loadams) | Microsoft |
| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft |
| Jeff Rasley | [jeffra](https://github.com/jeffra) | Snowflake |
| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
Loading

0 comments on commit 808aa45

Please sign in to comment.