
Commit

Merge branch 'master' into torch_compile_micro_offset_fix
tjruwase authored Oct 18, 2024
2 parents f3c80cb + 6eefc3d commit bc0da65
Showing 149 changed files with 4,746 additions and 910 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cpu-torch-latest.yml
@@ -50,5 +50,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.5"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.5"
1 change: 1 addition & 0 deletions .github/workflows/no-torch.yml
@@ -5,6 +5,7 @@ on:
pull_request:
paths:
- '.github/workflows/no-torch.yml'
- 'op_builder/**'
schedule:
- cron: "0 0 * * *"

4 changes: 2 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
@@ -19,7 +19,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
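This hunk, like the matching ones in the workflows below, moves the self-hosted runner label from cu117 to cu121 and points pip at the cu121 wheel index. As an illustrative sanity check that is not part of the workflow, the CUDA build of the installed torch wheel could be confirmed on the runner with:

python -c "import torch; print('torch', torch.__version__, 'built for CUDA', torch.version.cuda)"

If this prints a CUDA version other than 12.1 (or None for a CPU-only wheel), the wheel index URL and the runner label are out of sync.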
4 changes: 2 additions & 2 deletions .github/workflows/nv-ds-chat.yml
@@ -26,7 +26,7 @@ permissions:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -36,7 +36,7 @@ jobs:

- name: Install pytorch
run: |
pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
10 changes: 5 additions & 5 deletions .github/workflows/nv-inference.yml
@@ -22,7 +22,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -32,7 +32,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -58,8 +58,8 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
#pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="11.8"
pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="11.8"
#pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.1"
pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.1"
pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.1"
# run ds_report again to check updated op list
ds_report
6 changes: 2 additions & 4 deletions .github/workflows/nv-lightning-v100.yml
@@ -19,9 +19,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v3
@@ -31,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
4 changes: 2 additions & 2 deletions .github/workflows/nv-mii.yml
@@ -27,7 +27,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -37,7 +37,7 @@ jobs:

- name: Install pytorch
run: |
pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
6 changes: 3 additions & 3 deletions .github/workflows/nv-nightly.yml
@@ -18,7 +18,7 @@ permissions:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -28,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -58,7 +58,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.5" --cuda_ver="12.1"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
2 changes: 2 additions & 0 deletions .github/workflows/nv-sd.yml
@@ -53,6 +53,8 @@ jobs:
pip install image-similarity-measures
python -m pip install opencv-python==4.6.* --force-reinstall
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
# Update packages included in the container that do not support pydantic 2+ to versions that do
python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,sd]
ds_report
- name: Python environment
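The comment added in this hunk notes that thinc, spacy, and confection ship in the container at versions that do not support pydantic 2+, so they are upgraded before DeepSpeed is installed. An illustrative follow-up check, not part of the workflow itself, would be:

python -c "import pydantic; print('pydantic:', pydantic.VERSION)"
python -m pip check  # reports any installed package whose requirements conflict, e.g. a pydantic<2 pin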
8 changes: 4 additions & 4 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -19,7 +19,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -55,5 +55,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1"
4 changes: 2 additions & 2 deletions .github/workflows/nv-torch-nightly-v100.yml
@@ -15,7 +15,7 @@ permissions:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -25,7 +25,7 @@ jobs:

- name: Install pytorch
run: |
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
4 changes: 2 additions & 2 deletions .github/workflows/nv-transformers-v100.yml
@@ -18,7 +18,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu117, v100]
runs-on: [self-hosted, nvidia, cu121, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:
- name: Install pytorch
run: |
# use the same pytorch version as transformers CI
pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu121 --index-url https://download.pytorch.org/whl/cu121
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
65 changes: 65 additions & 0 deletions .github/workflows/xpu-compile.yml
@@ -0,0 +1,65 @@
name: xpu-compile

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- ".github/workflows/xpu-compile.yml"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
compile-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL

steps:
- uses: actions/checkout@v4
- name: Install prerequisite
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
pip install py-cpuinfo numpy
pip install .[dev,autotuning]
- name: Check container state
run: |
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list
- name: Compile Status
shell: bash
run: |
echo "# torch.compile graph breaks" >> $GITHUB_STEP_SUMMARY
export FI_HMEM=system
ulimit -n 1048575
cd tests/torch_compile
export ZE_AFFINITY_MASK=0,1
echo "## ZeRO stage 3" >> $GITHUB_STEP_SUMMARY
deepspeed test_compile.py --deepspeed_config ds_config_z3.json 2>&1 | tee log_z3.txt
# for each line starting with 'dynamo_output', extract the second and following fields and append them to GITHUB_STEP_SUMMARY using awk
cat log_z3.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY
echo "## ZeRO stage 2" >> $GITHUB_STEP_SUMMARY
deepspeed test_compile.py --deepspeed_config ds_config_z2.json 2>&1 | tee log_z2.txt
cat log_z2.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY
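To make the awk filter in the Compile Status step concrete, here is an illustrative invocation with a made-up log line; the real dynamo_output lines are emitted by tests/torch_compile/test_compile.py and their exact fields may differ:

echo "dynamo_output graph_breaks=3 unique_graphs=5" | awk '/^dynamo_output/ {$1=""; print $0}'
# prints " graph_breaks=3 unique_graphs=5": the leading tag is blanked, and the remaining fields are what the workflow appends to GITHUB_STEP_SUMMARY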
76 changes: 38 additions & 38 deletions CODEOWNERS
@@ -7,50 +7,50 @@


# top-level repo folders
/.github/ @mrwyattii @loadams
/azure/ @mrwyattii @awan-10
/benchmarks/ @awan-10 @mrwyattii
/bin/ @mrwyattii
/csrc/ @awan-10 @mrwyattii @cmikeh2 @arashb
/deepspeed/ @mrwyattii
/docker/ @mrwyattii @awan-10
/docs/ @mrwyattii
/examples/ @awan-10 @mrwyattii
/op_builder/ @mrwyattii @cmikeh2
/release/ @loadams @mrwyattii
/requirements/ @loadams @mrwyattii
/scripts/ @mrwyattii @awan-10
/tests/ @mrwyattii @tjruwase @loadams
/.github/ @loadams
/azure/ @awan-10
/benchmarks/ @awan-10 @tjruwase
/bin/ @loadams
/csrc/ @awan-10
/deepspeed/ @loadams @tjruwase
/docker/ @awan-10
/docs/ @loadams @tjruwase
/examples/ @awan-10 @tohtana
/op_builder/ @loadams @tjruwase @jomayeri
/release/ @loadams
/requirements/ @loadams
/scripts/ @awan-10
/tests/ @tjruwase @loadams @tohtana

# deepspeed
/deepspeed/autotuning/ @mrwyattii
/deepspeed/autotuning/ @loadams
/deepspeed/checkpoint/ @tjruwase
/deepspeed/comm/ @awan-10
/deepspeed/compression/ @minjiaz @xiaoxiawu-microsoft @conglongli
/deepspeed/elasticity/ @mrwyattii @awan-10
/deepspeed/launcher/ @mrwyattii @awan-10
/deepspeed/module_inject/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/moe/ @awan-10
/deepspeed/monitor/ @awan-10 @mrwyattii
/deepspeed/nebula/ @tjruwase @mrwyattii
/deepspeed/ops/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/pipe/ @ShadenSmith @duli2012
/deepspeed/profiling/ @ShijieZZZZ
/deepspeed/utils/ @mrwyattii @tjruwase @awan-10
/deepspeed/compression/ @tjruwase
/deepspeed/elasticity/ @awan-10
/deepspeed/launcher/ @loadams
/deepspeed/module_inject/ @awan-10
/deepspeed/moe/ @tohtana
/deepspeed/monitor/ @awan-10
/deepspeed/nebula/ @tjruwase
/deepspeed/ops/ @tohtana
/deepspeed/pipe/ @tohtana @loadams
/deepspeed/profiling/ @loadams
/deepspeed/utils/ @tjruwase @awan-10

# inference
/deepspeed/inference/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/model_implementations/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/inference/ @awan-10
/deepspeed/model_implementations/ @awan-10

# training
/deepspeed/runtime/ @mrwyattii @tjruwase
/deepspeed/runtime/activation_checkpointing/ @mrwyattii @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase @mrwyattii
/deepspeed/runtime/ @tjruwase @tohtana
/deepspeed/runtime/activation_checkpointing/ @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase
/deepspeed/runtime/comm/ @awan-10
/deepspeed/runtime/compression/ @awan-10 @conglongli
/deepspeed/runtime/data_pipeline/ @conglongli
/deepspeed/runtime/fp16/ @mrwyattii @tjruwase
/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10
/deepspeed/runtime/pipe/ @ShadenSmith @duli2012
/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii
/deepspeed/runtime/zero/ @tjruwase @mrwyattii
/deepspeed/runtime/compression/ @awan-10
/deepspeed/runtime/data_pipeline/ @tjruwase
/deepspeed/runtime/fp16/ @tjruwase
/deepspeed/runtime/fp16/onebit/ @awan-10
/deepspeed/runtime/pipe/ @loadams
/deepspeed/runtime/swap_tensor/ @tjruwase
/deepspeed/runtime/zero/ @tjruwase
2 changes: 2 additions & 0 deletions README.md
@@ -2,6 +2,7 @@
[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
[![Downloads](https://static.pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed)
[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status)
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9530/badge)](https://www.bestpractices.dev/projects/9530)
[![Twitter](https://img.shields.io/twitter/follow/MSFTDeepSpeed)](https://twitter.com/intent/follow?screen_name=MSFTDeepSpeed)
[![Japanese Twitter](https://img.shields.io/badge/%E6%97%A5%E6%9C%AC%E8%AA%9ETwitter-%40MSFTDeepSpeedJP-blue)](https://twitter.com/MSFTDeepSpeedJP)
[![Chinese Zhihu](https://img.shields.io/badge/%E7%9F%A5%E4%B9%8E-%E5%BE%AE%E8%BD%AFDeepSpeed-blue)](https://www.zhihu.com/people/deepspeed)
@@ -141,6 +142,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) |
| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml/badge.svg?branch=master)](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml) |

# Installation
