From ca4bc66b5401eb3ecd46d56642fdc4eb658c1c21 Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Tue, 9 Apr 2024 03:11:03 +0530 Subject: [PATCH 01/36] Increase CI coverage for Gaudi2 accelerator. (#5358) This change increase CI coverage for HPU accelerator. --------- Co-authored-by: Logan Adams Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .github/workflows/hpu-gaudi2.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 1881c968b560..12d8ee661fa8 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -8,7 +8,23 @@ on: paths: - ".github/workflows/hpu-gaudi2.yml" - "accelerator/hpu_accelerator.py" - + - "op_builder/hpu/**" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/engine.py" + - "deepspeed/module_inject/auto_tp.py" + - "deepspeed/module_inject/replace_module.py" + - "deepspeed/module_inject/load_checkpoint.py" + - "deepspeed/module_inject/inject.py" + - "deepspeed/ops/transformer/**" + - "deepspeed/ops/adam/**" concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 873738897805f8bff31fd8145bf67ef9e9379095 Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Wed, 10 Apr 2024 01:17:25 +0800 Subject: [PATCH 02/36] Add CI for Intel XPU/Max1100 (#5376) Add basic workflow for tests on intel xpu. Currently we have part of tests enabled. We will add more tests in later PRs. 
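As a minimal sketch of the container sanity check this workflow performs before running the unit tests (assuming the intel/intel-extension-for-pytorch XPU container referenced below, with torch, intel_extension_for_pytorch and DeepSpeed installed), the Python side of the "Check container state" and "Install deepspeed" steps boils down to:

```python
# Standalone version of the checks run in the workflow steps; assumes an XPU container
# where torch, intel_extension_for_pytorch and deepspeed are already installed.
import torch
import intel_extension_for_pytorch  # noqa: F401  # registers the 'xpu' device with torch
from deepspeed.accelerator import get_accelerator

print("torch:", torch.__version__)
print("XPU available:", torch.xpu.is_available())
print("accelerator:", get_accelerator()._name)  # expected to report 'xpu' on a Max1100 node
```

If any of these prints fail or report the wrong accelerator, the remaining pytest steps are not expected to pass.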
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .github/workflows/xpu-max1100.yml | 75 +++++++++++++++++++++++++++++++ tests/unit/util.py | 3 ++ 2 files changed, 78 insertions(+) create mode 100644 .github/workflows/xpu-max1100.yml diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml new file mode 100644 index 000000000000..078cd61801a4 --- /dev/null +++ b/.github/workflows/xpu-max1100.yml @@ -0,0 +1,75 @@ +name: xpu-max1100 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu.yml" + - "accelerator/xpu_accelerator.py" + - "accelerator/abstract_accelerator.py" + - "accelerator/cpu_accelerator.py" + - "accelerator/real_accelerator.py" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + + +jobs: + unit-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/intel-extension-for-pytorch:2.1.20-xpu + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v3 + - name: Check container state + shell: bash + run: | + ldd --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + + - name: Install deepspeed + run: | + pip install py-cpuinfo + pip install .[dev,autotuning] + ds_report + python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + pip install pytest pytest-timeout tabulate + cd tests/unit + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py diff --git a/tests/unit/util.py b/tests/unit/util.py index e8e0f476371b..5acf580ed3c8 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -48,6 +48,7 @@ def bf16_required_version_check(accelerator_check=True): nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' hpu_available = get_accelerator().device_name() == 'hpu' + xpu_available = get_accelerator().device_name() == 'xpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True @@ -55,6 +56,8 @@ def bf16_required_version_check(accelerator_check=True): return True elif hpu_available: return True + elif xpu_available: + return True else: return False From 7e126d20b968db5e2a31f198bab6fd79ab33260b Mon Sep 17 00:00:00 2001 From: 
Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 9 Apr 2024 14:18:49 -0700 Subject: [PATCH 03/36] Update path name on xpu-max1100.yml, add badge in README (#5386) - Fixes xpu-max1100 not running on PR because of incorrect yml name. --- .github/workflows/xpu-max1100.yml | 3 +-- README.md | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 078cd61801a4..43a29598d12c 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -6,7 +6,7 @@ on: - cron: "0 0 * * *" pull_request: paths: - - ".github/workflows/xpu.yml" + - ".github/workflows/xpu-max1100.yml" - "accelerator/xpu_accelerator.py" - "accelerator/abstract_accelerator.py" - "accelerator/cpu_accelerator.py" @@ -20,7 +20,6 @@ on: - "deepspeed/runtime/zero/parameter_offload.py" - "deepspeed/runtime/pipe/engine.py" - "deepspeed/runtime/utils.py" - - "deepspeed/inference/**" concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/README.md b/README.md index 434a0dd53586..5f092a56c117 100755 --- a/README.md +++ b/README.md @@ -133,6 +133,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | | CPU | [![torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | | Intel Gaudi | [![hpu-gaudi2](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | +| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg)?branch=master](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) 
[![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | From eb216a837b988caeef94c0dd6036e0d0f351d4f7 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 9 Apr 2024 16:18:24 -0700 Subject: [PATCH 04/36] Update XPU workflow too --- .github/workflows/xpu-max1100.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 43a29598d12c..f83c58dc7ff5 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -40,7 +40,7 @@ jobs: options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state shell: bash run: | From 62ab18102036a9079e0fd569cb149d6bdbd64d7b Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 9 Apr 2024 16:18:59 -0700 Subject: [PATCH 05/36] Revert "Update XPU workflow too" This reverts commit eb216a837b988caeef94c0dd6036e0d0f351d4f7. --- .github/workflows/xpu-max1100.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index f83c58dc7ff5..43a29598d12c 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -40,7 +40,7 @@ jobs: options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v3 - name: Check container state shell: bash run: | From 63029e8f5d1d248a47ad52c497899cbefffbfee9 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:28:21 -0700 Subject: [PATCH 06/36] Update checkout action on workflows on ubuntu 20.04 (#5387) - Only workflows running Ubuntu 20.04 or later can be updated as the GLIBC that is needed for node 20+ can be updated now. - Workflows that aren't updated are running Ubuntu 18.04 or older, those will need to be moved to updated images shortly and will be updated later in the original PR, #5021 Sample warning that is resolved: ``` Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: actions/checkout@v3. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/. 
``` --- .github/workflows/amd-mi200.yml | 2 +- .github/workflows/cpu-torch-latest.yml | 2 +- .github/workflows/formatting.yml | 2 +- .github/workflows/hpu-gaudi2.yml | 2 +- .github/workflows/nv-a6000.yml | 2 +- .github/workflows/nv-accelerate-v100.yml | 2 +- .github/workflows/nv-ds-chat.yml | 2 +- .github/workflows/nv-h100.yml | 2 +- .github/workflows/nv-inference.yml | 2 +- .github/workflows/nv-mii.yml | 2 +- .github/workflows/nv-nightly.yml | 2 +- .github/workflows/nv-pre-compile-ops.yml | 2 +- .github/workflows/nv-sd.yml | 2 +- .github/workflows/nv-torch-latest-v100.yml | 2 +- .github/workflows/nv-torch-nightly-v100.yml | 2 +- .github/workflows/nv-transformers-v100.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/xpu-max1100.yml | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index e4b938d8e078..00ff72ac8929 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -21,7 +21,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 5096de931be4..9c1ad02f75a6 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index 88dfa34a0a2b..d2554b7c0038 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 12d8ee661fa8..a3c12d057cc9 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -99,7 +99,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 960e0203919e..3ce406948432 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -29,7 +29,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 1fccbece2994..915493bb3183 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -22,7 +22,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index f61637be7e0e..94571eb101bb 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -24,7 +24,7 @@ jobs: runs-on: [self-hosted, 
nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml index 93f074787372..5574ce8aa634 100644 --- a/.github/workflows/nv-h100.yml +++ b/.github/workflows/nv-h100.yml @@ -23,7 +23,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index 6b339f457802..f863226bfb95 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -25,7 +25,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index 31379f7e758b..8452c138c717 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -30,7 +30,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index ca091990cf4b..b1e8c042214f 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -18,7 +18,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index 6e308242ecf0..6afc11fddaab 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -26,7 +26,7 @@ jobs: image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index 0af9517c5b59..b348d5ff931f 100644 --- a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -33,7 +33,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 14d33680521d..3ca8ac43dfa4 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -22,7 +22,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index bd13047f6078..257040439005 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -18,7 +18,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 75f53c95c235..cfed6d6583e6 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -21,7 +21,7 @@ jobs: runs-on: [self-hosted, nvidia, 
cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8e016b4169cb..5a931125eff6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -11,7 +11,7 @@ jobs: environment: release-env steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: "master" - id: setup-venv diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 43a29598d12c..f83c58dc7ff5 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -40,7 +40,7 @@ jobs: options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state shell: bash run: | From 6dcced1d5c997876e9a1279f79ec8f9339561846 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 10 Apr 2024 08:39:24 -0700 Subject: [PATCH 07/36] Cleanup required_torch_version code and references. (#5370) - Move `required_torch_version` check from deepspeed.runtime.utils to deepspeed.utils.torch (newly created). - Remove unused duplicate definition from `tests/unit/util.py`. - Update all references to this function. - Switch checks in `deepspeed/runtime/pipe/p2p.py` to use this function. - Switch checks in `deepspeed/comm/torch.py` to use this function. --------- Co-authored-by: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> --- deepspeed/comm/torch.py | 28 +++---------------- deepspeed/elasticity/utils.py | 2 +- deepspeed/runtime/comm/nccl.py | 2 +- deepspeed/runtime/fp16/fused_optimizer.py | 3 +- deepspeed/runtime/fp16/onebit/adam.py | 2 +- deepspeed/runtime/fp16/onebit/lamb.py | 2 +- deepspeed/runtime/fp16/onebit/zoadam.py | 2 +- deepspeed/runtime/fp16/unfused_optimizer.py | 3 +- deepspeed/runtime/pipe/p2p.py | 8 ++---- deepspeed/runtime/utils.py | 15 ---------- deepspeed/utils/torch.py | 22 +++++++++++++++ tests/unit/alexnet_model.py | 2 +- tests/unit/checkpoint/test_mics_optimizer.py | 2 +- tests/unit/checkpoint/test_moe_checkpoint.py | 2 +- .../checkpoint/test_universal_checkpoint.py | 2 +- tests/unit/checkpoint/test_zero_optimizer.py | 2 +- tests/unit/compression/test_compression.py | 2 +- .../quantization/test_intX_quantization.py | 2 +- .../test_configurable_parallel_mp.py | 2 +- .../test_configurable_parallel_pp.py | 2 +- tests/unit/moe/test_moe.py | 2 +- tests/unit/moe/test_moe_tp.py | 2 +- .../flops_profiler/test_flops_profiler.py | 2 +- .../runtime/compile/test_compile_wrapper.py | 2 +- .../unit/runtime/compile/test_compile_zero.py | 2 +- .../unit/runtime/compile/test_load_config.py | 2 +- .../half_precision/onebit/test_onebit.py | 2 +- .../unit/runtime/half_precision/test_fp16.py | 2 +- tests/unit/runtime/test_ds_initialize.py | 3 +- tests/unit/util.py | 15 ---------- 30 files changed, 56 insertions(+), 85 deletions(-) create mode 100644 deepspeed/utils/torch.py diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index d96c63d84e60..35c31f9fe2ec 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -9,6 +9,7 @@ from .backend import * from .comm import * from ..runtime import compiler +from deepspeed.utils.torch import required_torch_version import os DS_COMM_ALL_GATHER_OFF = False @@ -18,40 +19,19 @@ DS_COMM_REDUCE_OFF = False -def is_torch_ver_eq_2_0(): - TORCH_MAJOR, 
TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR == 2 and TORCH_MINOR == 0: - return True - return False - - -def is_torch_ver_ge_2_1(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - return True - return False - - -def torch_ver_ge_1_13(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 1 and TORCH_MINOR >= 13: - return True - return False - - def has_coalescing_manager(): has_c10d = hasattr(torch.distributed, 'distributed_c10d') return has_c10d and hasattr(torch.distributed.distributed_c10d, '_coalescing_manager') def has_all_reduce_coalesced(): - return hasattr(torch.distributed, "all_reduce_coalesced") and torch_ver_ge_1_13() + return hasattr(torch.distributed, "all_reduce_coalesced") and required_torch_version(min_version=1.13) def get_coalescing_manager(group, device, reqs, async_op): - if is_torch_ver_eq_2_0(): + if required_torch_version(min_version=2.0, max_version=2.0): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs) - elif is_torch_ver_ge_2_1(): + elif required_torch_version(min_version=2.1): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, async_ops=async_op) else: return torch.distributed.distributed_c10d._coalescing_manager(group, reqs) diff --git a/deepspeed/elasticity/utils.py b/deepspeed/elasticity/utils.py index 848ea8646449..78ae0352cf6e 100644 --- a/deepspeed/elasticity/utils.py +++ b/deepspeed/elasticity/utils.py @@ -3,7 +3,7 @@ # DeepSpeed Team -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version def is_torch_elastic_compatible(): diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 231f841ae8b7..a57b7519a295 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -9,7 +9,7 @@ import numpy as np from deepspeed.runtime.compression.cupy import CupyBackend -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 9ed250252e17..af8050c4646a 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -11,9 +11,10 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, required_torch_version, get_norm_with_moe_layers +from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger, log_dist +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import is_moe_param_group diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index ae3e5f573850..f8a50393ac5d 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator 
import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 9cd2e0f25648..0f70782fd3ff 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed import comm as dist -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index 9ef671e7e3b7..bd75ccd4f7a0 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index a7fd1910d7b2..530355f846e2 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -12,9 +12,10 @@ from torch._utils import _flatten_dense_tensors from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm, required_torch_version +from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT from deepspeed.accelerator import get_accelerator from deepspeed import comm as dist diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 31b9a14bf0f4..2b12a9573c4b 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -9,9 +9,7 @@ import torch from deepspeed import comm as dist -# To query whether we have send/recv support -from packaging.version import Version -from deepspeed.git_version_info import torch_info +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator _groups = None @@ -21,9 +19,7 @@ def can_send_recv() -> bool: - torch_version = Version(torch_info['version']) - sendrecv_min = Version('1.8') - return torch_version >= sendrecv_min + return required_torch_version(min_version=1.8) #initializes adjacent process groups diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 9d561f7271eb..c55f8a0e2995 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -14,7 +14,6 @@ import psutil import gc from math import sqrt -from packaging import version as pkg_version import torch from deepspeed import comm as dist @@ -1036,20 +1035,6 @@ def get_inactive_params(param_list): param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" - - torch_version = pkg_version.parse(torch.__version__) - - if min_version and 
pkg_version.parse(str(min_version)) > torch_version: - return False - - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False - - return True - - def get_norm_with_moe_layers(non_expert_norm, mpu, expert_tensors, norm_type=2): """ Compute the global norm with MoE experts diff --git a/deepspeed/utils/torch.py b/deepspeed/utils/torch.py new file mode 100644 index 000000000000..eb22d3561035 --- /dev/null +++ b/deepspeed/utils/torch.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from packaging import version as pkg_version + +import torch + + +def required_torch_version(min_version=None, max_version=None): + assert min_version or max_version, "Must provide a min_version or max_version argument" + + torch_version = pkg_version.parse(torch.__version__) + + if min_version and pkg_version.parse(str(min_version)) > torch_version: + return False + + if max_version and pkg_version.parse(str(max_version)) < torch_version: + return False + + return True diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index cf533063d6ec..25256d376eeb 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -11,7 +11,7 @@ import deepspeed import deepspeed.comm as dist import deepspeed.runtime.utils as ds_utils -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec diff --git a/tests/unit/checkpoint/test_mics_optimizer.py b/tests/unit/checkpoint/test_mics_optimizer.py index 3f853cd5c13a..9e56bf3446fa 100644 --- a/tests/unit/checkpoint/test_mics_optimizer.py +++ b/tests/unit/checkpoint/test_mics_optimizer.py @@ -8,7 +8,7 @@ import deepspeed -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * from unit.checkpoint.common import * diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index 36efe2a69002..89878b5d8fa9 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -4,7 +4,7 @@ # DeepSpeed Team from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * diff --git a/tests/unit/checkpoint/test_universal_checkpoint.py b/tests/unit/checkpoint/test_universal_checkpoint.py index 7adfe8410b55..e0c4f4745043 100644 --- a/tests/unit/checkpoint/test_universal_checkpoint.py +++ b/tests/unit/checkpoint/test_universal_checkpoint.py @@ -7,7 +7,7 @@ from types import SimpleNamespace from torch.utils._pytree import tree_map -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint import UNIVERSAL_CHECKPOINT_INFO from deepspeed.checkpoint.ds_to_universal import main as convert_to_universal diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 2312425c8aed..84b4eca6e2ca 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -8,7 +8,7 @@ from 
deepspeed.ops.op_builder import CPUAdamBuilder from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest, DistributedFixture from unit.simple_model import * diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index c6e5031349cb..1802c09f33b5 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -14,7 +14,7 @@ from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress from deepspeed.compression.helper import convert_conv1d_to_linear from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5), diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py index fd6a8e5ad2e1..77b51fcd5814 100644 --- a/tests/unit/inference/quantization/test_intX_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -11,7 +11,7 @@ from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization from deepspeed.inference.quantization.utils import Quantizer, DeQuantizer from deepspeed.inference.quantization.layers import QuantizedLinear -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from transformers.models.opt.modeling_opt import OPTDecoderLayer from transformers import AutoConfig, OPTConfig, AutoModel import pytest diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 824ecea5f144..cca1ef3584ad 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -13,7 +13,7 @@ from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index b500b9d857a5..e50fd18577b1 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -15,7 +15,7 @@ from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') diff --git 
a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index 0ec45933ff66..d39f9fe3d651 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -13,7 +13,7 @@ from deepspeed import get_accelerator from deepspeed.moe.sharded_moe import top1gating from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version @pytest.mark.parametrize("zero_stage", [0, 1, 2]) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index 0069c674690c..eb4668015c01 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -7,7 +7,7 @@ import deepspeed import pytest from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.moe.layer import MoE diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index bbcb01b489f4..c72deecf287f 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -9,7 +9,7 @@ from deepspeed.profiling.flops_profiler import get_model_profile from unit.simple_model import SimpleModel, random_dataloader from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator if torch.half not in get_accelerator().supported_dtypes(): diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py index 0bebeed117b4..d1830534f6ea 100644 --- a/tests/unit/runtime/compile/test_compile_wrapper.py +++ b/tests/unit/runtime/compile/test_compile_wrapper.py @@ -8,7 +8,7 @@ import deepspeed from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py index 79ab5efd5099..7568c27e3ed2 100644 --- a/tests/unit/runtime/compile/test_compile_zero.py +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -7,7 +7,7 @@ import torch from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from unit.runtime.compile.util import compare_loss diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py index f3c53ede91fd..601adae58884 100644 --- a/tests/unit/runtime/compile/test_load_config.py +++ b/tests/unit/runtime/compile/test_load_config.py @@ -9,7 +9,7 @@ from unit.simple_model import SimpleModel import deepspeed from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index 32ee262f3714..1be2d73ef4c0 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ 
b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -17,7 +17,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 9229794b39f8..5b300053d2a8 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -10,7 +10,7 @@ from deepspeed.ops.adam import FusedAdam from unit.common import DistributedTest from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import CPUAdamBuilder from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index 1f3cc991eba7..169096a6d4e5 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -17,7 +17,8 @@ from deepspeed.ops.adam import FusedAdam from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR from deepspeed.runtime.config import ADAM_OPTIMIZER -from deepspeed.runtime.utils import see_memory_usage, required_torch_version +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator diff --git a/tests/unit/util.py b/tests/unit/util.py index 5acf580ed3c8..feec326ede6c 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -7,7 +7,6 @@ import torch from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported from deepspeed.git_version_info import torch_info -from packaging import version as pkg_version def skip_on_arch(min_arch=7): @@ -62,20 +61,6 @@ def bf16_required_version_check(accelerator_check=True): return False -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" - - torch_version = pkg_version.parse(torch.__version__) - - if min_version and pkg_version.parse(str(min_version)) > torch_version: - return False - - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False - - return True - - def required_amp_check(): from importlib.util import find_spec if find_spec('apex') is None: From cc9e7b9c79ae48a7e93b9089e264eafb919178ab Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Thu, 11 Apr 2024 00:46:00 +0800 Subject: [PATCH 08/36] Update README.md for intel XPU support (#5389) 1. fix a type error 2. 
update the intel xpu HW support status --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f092a56c117..a1335caa4949 100755 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | | CPU | [![torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | | Intel Gaudi | [![hpu-gaudi2](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | -| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg)?branch=master](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | +| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation 
Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | @@ -163,7 +163,7 @@ dynamically link them at runtime. | ----------- | -------- | ---------------- | --------------------- | ------------------ | | Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | | Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | No | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. From a8b821535aa0b254efa681d51b4951734ca021cc Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:50:47 -0700 Subject: [PATCH 09/36] Optimize the fp-dequantizer to get high memory-BW utilization (#5373) This PR removes the for loop inside the dequantizer kernel and use as many threads and blocks as needed to dequantize the quantized matrix. The previous implementation was processing each group per thread block which can reduce the efficiency when have having smaller group-size and also processes more data per-thread which is unnecessary and we can use more parallelism to improve the dequantization performance. Based on my testing results, for a 4K by 4K matrix, dequantizing from fp8 to bf16 gives 2.5x speedup (improving the BW efficiency from 1 TB/s to 2.5 TB/s on Nvidia H100 GPU). --------- Co-authored-by: Reza Yazdani --- csrc/fp_quantizer/quantize.cu | 168 ++++++++++++++-------------------- 1 file changed, 68 insertions(+), 100 deletions(-) diff --git a/csrc/fp_quantizer/quantize.cu b/csrc/fp_quantizer/quantize.cu index 37be6cc0657c..5f0b58f124f0 100644 --- a/csrc/fp_quantizer/quantize.cu +++ b/csrc/fp_quantizer/quantize.cu @@ -219,119 +219,100 @@ __global__ void apply_quantization(T* val, } template -__global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size) +__global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size, int total_num_elements) { - int tidx = threadIdx.x; - int wid = tidx >> 5; - int lane = tidx & 0x1f; - int gid = blockIdx.x * quantization::warps + wid; + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.x * blockDim.x + threadIdx.x) * vector_size; + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); - - constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); - constexpr uint32_t load_stride = vector_size * hw_warp_size; - const uint32_t thread_offset = lane * vector_size; - const uint32_t thread_load_offset = lane * vector_size * quantized_bits / 8; - const uint32_t base_load_offset = - gid * (group_size * quantized_bits / 8 + 4) + thread_load_offset; // 4-byte scale offset - const uint32_t base_store_offset = gid * group_size + thread_offset; - const uint8_t* load_base_ptr = val + base_load_offset; + const uint32_t g_index = (tidx / group_size); + const uint32_t 
group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (tidx % group_size) * quantized_bits / 8; int mantisa_mask = ((1 << q_mantisa_bits) - 1); mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); - T* store_base_ptr = q_val + base_store_offset; - float scale; //= q_scale[gid]; + T* store_base_ptr = q_val + tidx; + float scale; uint8_t* scale_as_int8 = reinterpret_cast(&scale); if (quantized_bits == 6) { mem_access::load_global( - scale_as_int8, - val + gid * (group_size * quantized_bits / 8 + 4) + (group_size * quantized_bits / 8)); + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); mem_access::load_global( scale_as_int8 + quantization::quanitzed_access_granularity_6bits, - val + gid * (group_size * quantized_bits / 8 + 4) + (group_size * quantized_bits / 8) + + val + g_index * (group_size_bytes + 4) + group_size_bytes + quantization::quanitzed_access_granularity_6bits); } else mem_access::load_global( - scale_as_int8, - val + gid * (group_size * quantized_bits / 8 + 4) + (group_size * quantized_bits / 8)); - -#pragma unroll - for (int i = 0; i < unroll; i++) { - if (i * load_stride + thread_offset < group_size) { - uint64_t q_buf_in; - uint64_t q_buf_in1; - uint8_t* int8_data = reinterpret_cast(&q_buf_in); - uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); - uint32_t loading_offset = i * load_stride * quantized_bits / 8; - if (quantized_bits == 6) { - mem_access::load_global( - int8_data, load_base_ptr + loading_offset); - mem_access::load_global( - int8_data + quantization::quanitzed_access_granularity_6bits, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity_6bits); - mem_access::load_global( - int8_data + quantization::quanitzed_access_granularity_6bits * 2, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity_6bits * 2); - } else { + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { mem_access::load_global( - int8_data, load_base_ptr + loading_offset); - if (quantized_bits > 4) { + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { mem_access::load_global( - int8_data + quantization::quanitzed_access_granularity, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity); - if (quantized_bits == 12) { - mem_access::load_global( - int8_data1, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity * 2); - } + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); } } - T store_buf[vector_size]; - uint16_t* q_buf = reinterpret_cast(store_buf); + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); #pragma unroll - for (int j = 0; j < 
vector_size; j++) { - uint16_t new_data; - if (j < 5 || quantized_bits != 12) { - new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); - } else { - if (j == 5) { - new_data = (uint16_t)(q_buf_in1); - new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); - } else - new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); - } + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = (uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } - uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); - uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; - uint16_t dst_mantisa = (new_data & _mantisa_mask); + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); - if (dst_exponent != (1 << q_exponent_bits) - 1) - dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + - (1 << (q_exponent_bits - 1)) - 1; + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; - q_buf[j] = ((sign << (q_exponent_bits + q_mantisa_bits)) | - (dst_exponent << q_mantisa_bits) | - (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); - float up_cast = conversion::to(store_buf[j]); - store_buf[j] = conversion::to(up_cast * scale); - } - mem_access::store_global( - store_base_ptr + i * load_stride, store_buf); + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); } + mem_access::store_global(store_base_ptr, store_buf); } } @@ -386,12 +367,6 @@ INSTANTIATE_LAUNCH_QUANTIZATION(__nv_bfloat16, 23, 8); #endif INSTANTIATE_LAUNCH_QUANTIZATION(__half, 23, 8); -#define LAUNCH_FOR_DEQUANTIZATION_UNROLL(COUNT) \ - case COUNT: \ - apply_dequantization \ - <<>>(val, q_val, group_size); \ - break; - template void launch_dequantization(uint8_t* val, T* q_val, @@ -401,21 +376,14 @@ void launch_dequantization(uint8_t* val, int q_exponent_bits, cudaStream_t stream) { - const dim3 grid((num_groups + quantization::warps - 1) / quantization::warps); + int blocks = ((num_groups * group_size) - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(blocks); const dim3 block(quantization::threads); - - constexpr int vals_per_unroll = hw_warp_size * quantization::access_granularity / sizeof(T); - const int copy_unroll = (group_size + vals_per_unroll - 1) / vals_per_unroll; - DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { - switch (copy_unroll) { - LAUNCH_FOR_DEQUANTIZATION_UNROLL(1) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(2) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(3) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(4) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(5) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(6) - } + apply_dequantization + <<>>(val, q_val, group_size, (num_groups * group_size)); }); } #define INSTANTIATE_LAUNCH_DEQUANTIZATION(T, mantisa) \ From 2c51aba0b70a053252c6b0c95ecdecebb7b64d43 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka 
<81312776+tohtana@users.noreply.github.com> Date: Sat, 13 Apr 2024 13:40:31 -0700 Subject: [PATCH 10/36] Add custom reshaping for universal checkpoint (#5390) This PR adds more flexibility to define weight tensor reshaping for universal checkpointing. Currently universal checkpointing assumes a few patterns of partitioning for tensor parallelism, such as column/row wise partitioning of a 2-dim tensor. However, these are not flexible enough to define partitioning for more complex usages. Here are some examples: 1) MoE: The user may define the weight tensor for MoE's FFN as [n_experts * hidden_out, hidden_in]. For TP, we need to *view* this tensor as 3-dim tensor and partition it along `hidden_out` dimension. 2) GQA: The weights for QKV are often represented as one tensor and we may have Q, K and V with different sizes. The tensor shape will be [q_size + k_size + v_size, hidden]. We partition this along first dimension but for each Q, K, and V. In this case, we first need to partition Q, V, and V separately and then concatenate them to get a shard for TP. We propose a new pattern `PARAMETER_WITH_SUB_PARAMS` to support this. Here is the usage to cover the above use cases. You can define the view of the weight tensor and specify the dimension for partitioning based on the view. ```python from deepspeed.checkpoint import PARAMETER_WITH_SUB_PARAMS, SubparamShape info[PARAMETER_WITH_SUB_PARAMS] = [ asdict(SubparamShape(patterns=[layers_prefix + r"\d+moe.fc1.weight"], shape=(num_experts, hidden_out, hidden_in), partition_dim=1)), asdict(SubparamShape(patterns=[layers_prefix + r"\d+.qkv.weight"], shape=((q_size, k_size, v_size), hidden_size), partition_dim=0)), ... ] ``` The conversion script (`ds_to_universal.py`) merges TP-sharded weight tensors and the loader of universal checkpoints also partitions them following the information. Co-authored-by: Olatunji Ruwase --- deepspeed/checkpoint/__init__.py | 2 +- deepspeed/checkpoint/constants.py | 4 +++ deepspeed/checkpoint/ds_to_universal.py | 38 ++++++++++++++++++++ deepspeed/checkpoint/universal_checkpoint.py | 33 +++++++++++++++-- 4 files changed, 74 insertions(+), 3 deletions(-) diff --git a/deepspeed/checkpoint/__init__.py b/deepspeed/checkpoint/__init__.py index c9822693867d..1f645a494ade 100644 --- a/deepspeed/checkpoint/__init__.py +++ b/deepspeed/checkpoint/__init__.py @@ -15,6 +15,6 @@ from .zero_checkpoint import ZeROCheckpoint -from .universal_checkpoint import enable_universal_checkpoint +from .universal_checkpoint import enable_universal_checkpoint, SubparamShape from .constants import * diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index b3f199a67b98..046bc242002f 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -74,6 +74,8 @@ # Similarly, load_hp_checkpoint_state has to take the needed actions when loading from universal. 
PARAM_N_SUB_PARAMS = "param_n_sub_params" +SUB_PARAM_SHAPE = "sub_param_shape" + # Regex list of parameters that require special handling VOCABULARY_PARAMETER_PATTERNS = 'vocabulary_parameter_patterns' PIPELINE_REPLICATED_PARAMETER_PATTERNS = 'pipeline_replicated_parameter_patterns' @@ -81,3 +83,5 @@ PARAMETER_WITH_ROW_PARALLELISM_PATTERNS = 'parameter_with_row_parallelism_patterns' TP_REPLICATED_PARAMETER_PATTERNS = 'tp_replicated_parameter_patterns' PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0 = 'parameter_with_2_sub_params_cat_dim_0' +PARAMETER_WITH_SUB_PARAMS = 'parameter_with_sub_params' +SUB_PARAMS_SHAPE = 'sub_params_shape' diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index 9ec5d0b169e4..d5eca81c804f 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -6,6 +6,7 @@ # DeepSpeed Team from functools import partial +from itertools import chain import argparse import glob import itertools @@ -28,6 +29,7 @@ PARAM, CAT_DIM, PARAM_N_SUB_PARAMS, + SUB_PARAM_SHAPE, VOCAB_TENSOR, UNIVERSAL_CHECKPOINT_INFO, VOCABULARY_PARAMETER_PATTERNS, @@ -36,6 +38,8 @@ PARAMETER_TO_AVERAGE_PATTERNS, PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, + PARAMETER_WITH_SUB_PARAMS, + SubparamShape, ) @@ -180,8 +184,11 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): parameters_with_row_parallelism = universal_checkpoint_info.get(PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, []) vocabulary_parameters = universal_checkpoint_info.get(VOCABULARY_PARAMETER_PATTERNS, []) parameters_with_2_sub_params_cat_dim_0 = universal_checkpoint_info.get(PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, []) + parameter_with_sub_params = universal_checkpoint_info.get(PARAMETER_WITH_SUB_PARAMS, []) + unmatched_patterns = set(replicated_parameters + parameters_to_average + parameters_with_row_parallelism + vocabulary_parameters + parameters_with_2_sub_params_cat_dim_0) + unmatched_patterns.update(chain.from_iterable(SubparamShape(**s).patterns for s in parameter_with_sub_params)) def get_matched_pattern(patterns_, name_): matched_ = [pattern_ for pattern_ in patterns_ if re.match(pattern_, name_)] @@ -192,6 +199,17 @@ def get_matched_pattern(patterns_, name_): return pattern_ return None + def get_matched_sub_params_pattern(name_): + for subparam_shape_dict in parameter_with_sub_params: + subparam_shape = SubparamShape(**subparam_shape_dict) + for pattern_ in subparam_shape.patterns: + if re.match(pattern_, name_): + unmatched_patterns.discard(pattern_) + return subparam_shape + return None + + matched_sub_params_shape = get_matched_sub_params_pattern(name) + step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) if step_merged: _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) @@ -219,6 +237,26 @@ def get_matched_pattern(patterns_, name_): param = torch.cat([merged_chunks_0, merged_chunks_1], dim=cat_dim) ckpt_dict[CAT_DIM] = cat_dim ckpt_dict[PARAM_N_SUB_PARAMS] = 2 + elif matched_sub_params_shape: + merged_chunks = [] + partition_dim = matched_sub_params_shape.partition_dim + + sub_dim_sizes = matched_sub_params_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes, ) + + partition_shape = [sum(d) if isinstance(d, tuple) else d for d in matched_sub_params_shape.shape] + partition_shape = [d // tp_degree if i == partition_dim else d for i, d in enumerate(partition_shape)] + slices = 
[s.view(partition_shape) for s in slices] + + offset = 0 + for sub_dim_size in sub_dim_sizes: + part_sub_dim_size = sub_dim_size // tp_degree + merged_chunks.append( + torch.cat([s.narrow(partition_dim, offset, part_sub_dim_size) for s in slices], dim=partition_dim)) + offset += part_sub_dim_size + param = torch.cat(merged_chunks, dim=partition_dim) + ckpt_dict[SUB_PARAM_SHAPE] = matched_sub_params_shape else: cat_dim = 1 if get_matched_pattern(parameters_with_row_parallelism, name) else 0 # print(f"merge {name} with CAT DIM: {cat_dim}") diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index 86c8dc904b8c..064891a8bb54 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -7,7 +7,16 @@ import re import torch import types -from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS) +from typing import List, Tuple, Union +from dataclasses import dataclass +from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS, SUB_PARAM_SHAPE) + + +@dataclass +class SubparamShape: + patterns: List[str] + shape: Tuple[Union[Tuple[int], int]] + partition_dim: int def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): @@ -76,12 +85,32 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") + sub_param_shape = ckpt_dict.get(SUB_PARAM_SHAPE, None) # since when we do many to 1 on tp we cat sometimes on dim=0 and other times on dim=1 we have to do exactly the same in reverse # special case is when a single parameter is effectively a container for multiple sub parameters # (more details at PARAM_N_SUB_PARAMS definition) chunk_dim = ckpt_dict.get(CAT_DIM, 0) n_sub_params = ckpt_dict.get(PARAM_N_SUB_PARAMS, 1) - if n_sub_params > 1: + if sub_param_shape: + partition_dim = sub_param_shape.partition_dim + sub_dim_sizes = sub_param_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes, ) + + partition_shape = [sum(d) if isinstance(d, tuple) else d for d in sub_param_shape.shape] + full_hp_param = full_hp_param.view(partition_shape) + + offset = 0 + merged_chunks = [] + for sub_dim_size in sub_dim_sizes: + sub_params_tp_slice = full_hp_param.narrow(partition_dim, + offset, sub_dim_size).chunk(tp_world_size, + dim=partition_dim)[tp_rank] + merged_chunks.append(sub_params_tp_slice) + offset += sub_dim_size + tp_hp_slice = torch.cat(merged_chunks, dim=partition_dim) + + elif n_sub_params > 1: sub_params = full_hp_param.chunk(n_sub_params, dim=chunk_dim) sub_params_tp_slice = [p.chunk(tp_world_size, dim=chunk_dim)[tp_rank] for p in sub_params] tp_hp_slice = torch.cat(sub_params_tp_slice, dim=chunk_dim) From f69f8840fc62e6cbbbad9be4216729158611127e Mon Sep 17 00:00:00 2001 From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com> Date: Sun, 14 Apr 2024 02:11:14 +0530 Subject: [PATCH 11/36] Removal of cuda hardcoded string with get_device function (#5351) In UTs removed 'cuda' string hardcode by replacing with device variable set to get_accelerator().device_name() Co-authored-by: Shaik Raza Sikander Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- tests/unit/hybrid_engine/test_he_lora.py | 7 +++++-- tests/unit/inference/test_stable_diffusion.py | 6 +++--- .../transformer/inference/test_attention.py | 20 
+++++++++---------- .../ops/transformer/inference/test_gelu.py | 6 ++++-- .../transformer/inference/test_layer_norm.py | 13 ++++++------ .../ops/transformer/inference/test_softmax.py | 3 ++- 6 files changed, 31 insertions(+), 24 deletions(-) diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py index ea27239ed55e..5f53a237c340 100644 --- a/tests/unit/hybrid_engine/test_he_lora.py +++ b/tests/unit/hybrid_engine/test_he_lora.py @@ -15,6 +15,7 @@ import numpy.testing as npt from unit.common import DistributedTest from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -125,7 +126,8 @@ def get_model(self, model_name): model_config.dropout = 0.0 model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config) model = model.half() - model = model.to(f'cuda:{local_rank}') + device = get_accelerator().device_name() + model = model.to(f'{device}:{local_rank}') return model def get_tokenizer(self, model_name): @@ -190,7 +192,8 @@ def test_lora(self, batch_size, model_name, zero_stage, offload_device): model.train() batch = tokenizer(train_sentences, max_length=16, padding="max_length", truncation=True, return_tensors="pt") - batch = to_device(batch, f'cuda:{local_rank}') + device = get_accelerator().device_name() + batch = to_device(batch, f'{device}:{local_rank}') batch["labels"] = batch["input_ids"] outputs = model(**batch, use_cache=False) loss = outputs.loss diff --git a/tests/unit/inference/test_stable_diffusion.py b/tests/unit/inference/test_stable_diffusion.py index ac39b7ab12fa..775a02c2e878 100644 --- a/tests/unit/inference/test_stable_diffusion.py +++ b/tests/unit/inference/test_stable_diffusion.py @@ -20,14 +20,14 @@ class TestStableDiffusion(DistributedTest): def test(self): from diffusers import DiffusionPipeline from image_similarity_measures.quality_metrics import rmse - generator = torch.Generator(device=get_accelerator().current_device()) + dev = get_accelerator().device_name() + generator = torch.Generator(device=dev) seed = 0xABEDABE7 generator.manual_seed(seed) prompt = "a dog on a rocket" model = "prompthero/midjourney-v4-diffusion" local_rank = int(os.getenv("LOCAL_RANK", "0")) - device = torch.device(f"cuda:{local_rank}") - + device = torch.device(f"{dev}:{local_rank}") pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] diff --git a/tests/unit/ops/transformer/inference/test_attention.py b/tests/unit/ops/transformer/inference/test_attention.py index 13abe8b915c7..0521245bcdf3 100644 --- a/tests/unit/ops/transformer/inference/test_attention.py +++ b/tests/unit/ops/transformer/inference/test_attention.py @@ -31,23 +31,23 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float pytest.skip("triton has to be installed for the test") minus_inf = -65504.0 - + dev = deepspeed.accelerator.get_accelerator().device_name() # skip autotune in testing from deepspeed.ops.transformer.inference.triton.matmul_ext import fp16_matmul fp16_matmul.skip_autotune() from deepspeed.ops.transformer.inference.triton.attention import _triton_attention, _triton_packed_flash torch.manual_seed(20) - q = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - k = 
torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) + q = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) + k = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) + v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) sm_scale = 0.3 # reference implementation p = torch.matmul(q, k.transpose(2, 3)) * sm_scale score = p - mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device=dev) + M = torch.tril(torch.ones((N_CTX, N_CTX), device=dev)) if causal: for z in range(BATCH): for h in range(H): @@ -58,7 +58,7 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float context = ref_out # adjust it to expected tensor format and run test - qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device='cuda', requires_grad=False) + qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device=dev, requires_grad=False) qkv[:, :, :H * D_HEAD] = q.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) qkv[:, :, 1 * H * D_HEAD:2 * H * D_HEAD] = k.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) qkv[:, :, 2 * H * D_HEAD:] = v.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) @@ -66,12 +66,12 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float if use_flash: if not get_accelerator().is_triton_supported(): pytest.skip("triton flash attention is supported when the compute capability > 8.0") - triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device="cuda") + triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device=dev) if not causal: - lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device='cuda') + lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device=dev) for i, l in enumerate(lengths): triton_mask[i, ..., l:] = minus_inf - mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device=dev) for b in range(BATCH): mask[b, :, :, lengths[b]:] = minus_inf ref_out = ref_torch_attention(q, k, v, mask, sm_scale) diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py index de924848bfb4..675860b00bdb 100644 --- a/tests/unit/ops/transformer/inference/test_gelu.py +++ b/tests/unit/ops/transformer/inference/test_gelu.py @@ -42,8 +42,9 @@ def run_gelu_ds(activations, use_triton_ops=False): from deepspeed.ops.transformer.inference.triton import gelu return gelu(activations) + device = deepspeed.accelerator.get_accelerator().device_name() channels = activations.shape[-1] - bias = torch.zeros((channels), dtype=activations.dtype, device='cuda') + bias = torch.zeros((channels), dtype=activations.dtype, device=device) global inference_module if inference_module is None: inference_module = InferenceBuilder().load() @@ -60,7 +61,8 @@ def run_gelu_ds(activations, use_triton_ops=False): @pytest.mark.parametrize("dtype", [torch.float16]) @pytest.mark.parametrize("use_triton_ops", [True, False]) def test_gelu(batch, sequence, channels, dtype, use_triton_ops): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + 
device = deepspeed.accelerator.get_accelerator().device_name() + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) activations_ref = activations_ds.clone().detach() if not deepspeed.HAS_TRITON and use_triton_ops: diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 711a35213015..9eac612aa29c 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -175,19 +175,20 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): def test_triton_layer_norm(M, N, dtype, residual, input_bias, eps=1e-5, device='cuda'): if not deepspeed.HAS_TRITON: pytest.skip("triton has to be installed for the test") + dev = get_accelerator().device_name() torch.manual_seed(0) # create data x_shape = (M, N) w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - x_bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') + weight = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + bias = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + x_bias = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device=dev) dy = .1 * torch.randn_like(x) if residual: - res = torch.rand(x_shape, dtype=dtype, device='cuda', requires_grad=False) + res = torch.rand(x_shape, dtype=dtype, device=dev, requires_grad=False) else: - res = torch.zeros(x_shape, dtype=dtype, device='cuda', requires_grad=False) + res = torch.zeros(x_shape, dtype=dtype, device=dev, requires_grad=False) x.requires_grad_(True) # forward pass if residual or input_bias: diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py index 76046f31e01a..7d0d6e14b651 100644 --- a/tests/unit/ops/transformer/inference/test_softmax.py +++ b/tests/unit/ops/transformer/inference/test_softmax.py @@ -43,7 +43,8 @@ def run_softmax_ds(input, use_triton_ops=False): def test_softmax(batch, sequence, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") - input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + device = deepspeed.accelerator.get_accelerator().device_name() + input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) input_ref = input_ds.clone().detach() ds_out = run_softmax_ds(input_ds, use_triton_ops) From 7b5b06602d5941cf7ea6170062d3f81c9002d788 Mon Sep 17 00:00:00 2001 From: Guanhua Wang Date: Sun, 14 Apr 2024 11:26:19 -0700 Subject: [PATCH 12/36] fix pagable h2d memcpy (#5301) ZeRO offload case Fix the issue of pageble h2d memcpy in step process. Now h2d memcpy uses pinned memory. Speedup h2d memcpy by 6x on single GPU and 4-5x on 8GPU node. 
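For context, here is a minimal sketch of the pinned-memory idea (illustrative tensor names and sizes, not the actual optimizer code; `get_accelerator().pin_memory` and `current_device_name` are the accelerator APIs used in the diff below):

```python
# Hedged sketch: keep the fp32 master partition in page-locked (pinned) host
# memory so the step-time host-to-device copy can use DMA instead of a slow
# pageable-memory transfer. Sizes and names are made up for illustration.
import torch
from deepspeed.accelerator import get_accelerator

fp32_partition = torch.randn(1 << 20, dtype=torch.float32)      # CPU master weights
fp32_partition = get_accelerator().pin_memory(fp32_partition)   # pinned host buffer

device = get_accelerator().current_device_name()
bit16_partition = torch.empty(fp32_partition.numel(), dtype=torch.half, device=device)

# The H2D copy now reads from pinned memory, which is what gives the reported speedup.
bit16_partition.data.copy_(fp32_partition.to(device).data)
```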
cc @tjruwase --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Ubuntu Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/zero/stage_1_and_2.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 16b9c3c18919..225c085f6f2b 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -390,11 +390,16 @@ def __init__(self, # Note that the params in single_partition_of_fp32_groups is cloned and detached # from the origin params of the model. if not fp16_master_weights_and_gradients: - self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().float().detach()) + weights_partition = self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().float().detach() else: - self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().half().detach()) + weights_partition = self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().half().detach() + + if self.cpu_offload: + weights_partition = get_accelerator().pin_memory(weights_partition) + + self.single_partition_of_fp32_groups.append(weights_partition) # Set local optimizer to have flat params of its own partition. # After this, the local optimizer will only contain its own partition of params. @@ -1862,7 +1867,8 @@ def step(self, closure=None): # bit16_partitions[partition_id].data.copy_(fp32_partition.data) bit16_partitions = self.parallel_partitioned_bit16_groups[i] fp32_partition = self.single_partition_of_fp32_groups[i] - bit16_partitions[partition_id].data.copy_(fp32_partition.data) + bit16_partitions[partition_id].data.copy_( + fp32_partition.to(get_accelerator().current_device_name()).data) self.timers(OPTIMIZER_STEP_TIMER).stop() else: From 54c06872647ca60699f752e60ac1643bd05aa63c Mon Sep 17 00:00:00 2001 From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com> Date: Sun, 14 Apr 2024 22:40:52 +0300 Subject: [PATCH 13/36] stage3: efficient compute of scaled_global_grad_norm (#5256) using torch.norm instead of inefficient for loop --------- Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/stage3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index c6ff216edfcb..68cab13c4a93 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -15,7 +15,7 @@ from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce -from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, get_only_unique_item +from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum @@ -2027,7 +2027,7 @@ def step(self, closure=None): return norm_groups = self._get_norm_groups() - scaled_global_grad_norm = get_global_norm(norm_list=norm_groups) + scaled_global_grad_norm = torch.norm(torch.stack(norm_groups)) # Stash unscaled gradient norm self._global_grad_norm = 
scaled_global_grad_norm / self.loss_scale From e3d873a00ec2a78f74796f9a1a0d571f310e5f87 Mon Sep 17 00:00:00 2001 From: "ZHENG, Zhen" Date: Tue, 16 Apr 2024 01:23:28 +0800 Subject: [PATCH 14/36] Fix the FP6 kernels compilation problem on non-Ampere GPUs. (#5333) Refine the guards of FP6 kernel compilation. Fix the `undefined symbol` problem of FP6 kernels on non-Ampere architectures. Related issue: https://github.com/microsoft/DeepSpeed-MII/issues/443. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Michael Wyatt --- .../v2/kernels/core_ops/core_ops.cpp | 4 +-- .../cuda_linear/include/kernel_matmul.cuh | 11 ++++++++ .../cuda_linear/include/kernel_reduction.cuh | 5 ++++ .../cuda_linear/include/ptx_cp.async.cuh | 25 ++++++++++++++++--- .../core_ops/cuda_linear/include/ptx_mma.cuh | 16 ++++++++++-- .../cuda_linear/include/utils_core.cuh | 4 +-- .../cuda_linear/include/utils_gmem.cuh | 4 +-- .../include/utils_paralleldequant.cuh | 4 +-- .../cuda_linear/include/weight_prepacking.h | 5 ++++ ..._linear_kernels.cpp => linear_kernels.cpp} | 2 +- ...cuda_linear_kernels.h => linear_kernels.h} | 7 ++++-- .../{fp6_linear.cu => linear_kernels_cuda.cu} | 11 +++++--- .../{fp6_linear.cuh => linear_kernels_cuda.h} | 5 ++++ deepspeed/inference/v2/modules/heuristics.py | 11 +++++++- op_builder/inference_core_ops.py | 13 ++-------- .../modules/test_quantized_linear_module.py | 3 +-- 16 files changed, 96 insertions(+), 34 deletions(-) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{cuda_linear_kernels.cpp => linear_kernels.cpp} (99%) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{cuda_linear_kernels.h => linear_kernels.h} (85%) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{fp6_linear.cu => linear_kernels_cuda.cu} (98%) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{fp6_linear.cuh => linear_kernels_cuda.h} (94%) diff --git a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp index 2397b0694696..3f36a6bf01cb 100644 --- a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp +++ b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp @@ -8,9 +8,9 @@ #include "bias_activation.h" #include "blas.h" -#include "cuda_linear_kernels.h" #include "gated_activation_kernels.h" #include "layer_norm.h" +#include "linear_kernels.h" #include "rms_norm.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) @@ -35,7 +35,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("rms_norm", &rms_norm, "DeepSpeed rms norm in CUDA"); m.def("rms_pre_norm", &rms_pre_norm, "DeepSpeed rms pre norm in CUDA"); - // cuda_linear_kernels.h + // linear_kernels.h m.def("cuda_wf6af16_linear", &cuda_wf6af16_linear, "DeepSpeed Wf6Af16 linear in CUDA"); m.def( "preprocess_weight", &preprocess_weight, "preprocess the FP16 weight to be 2bit and 4 bit"); diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh index aa6ea6c4b1c2..0262baef4614 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_KERNEL_MATMUL_CUH +#define DEEPSPEED_CUDA_LINEAR_KERNEL_MATMUL_CUH + #include "configs.h" #include "utils_core.cuh" #include "utils_gmem.cuh" @@ -26,6 +29,8 @@ __global__ 
void QUANT_GEMM_Kernel(const uint4* Weight1, const size_t K_Global, int Split_K) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 900 + #ifdef DEBUG_MODE assert(K_Global % TilingConfig::TILE_K == 0); assert(M_Global % TilingConfig::TILE_M == 0); @@ -258,4 +263,10 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight1, else BlockGlobalPTR[j + i * M_Global] = smem_CFrag[i][j]; } + +#else +#warning "The FP6 functions are only available on Ampere GPUs." +#endif } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh index 8c49f8b0b3a5..c417e6a46a7c 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_KERNEL_REDUCTION_CUH +#define DEEPSPEED_CUDA_LINEAR_KERNEL_REDUCTION_CUH + #include #include #include @@ -36,3 +39,5 @@ __global__ void SplitK_Reduction(half* C, #pragma unroll for (int i = 0; i < HALF_PER_128BIT; i++) THREAD_GPTR_C[i] = __float2half_rn(Results[i]); } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh index 7f36cfd5d961..39874e023539 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef PTX_CP_ASYNC_CUH -#define PTX_CP_ASYNC_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_PTX_CP_ASYNC_CUH +#define DEEPSPEED_CUDA_LINEAR_PTX_CP_ASYNC_CUH #include #include @@ -17,6 +17,7 @@ __device__ __forceinline__ void cp_async(half* smem_ptr, const half* global_ptr, bool pred_guard = true) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 static_assert(SizeInBytes == 16, "Size is not supported"); unsigned smem_int_ptr = __cvta_generic_to_shared(smem_ptr); asm volatile( @@ -28,25 +29,43 @@ __device__ __forceinline__ void cp_async(half* smem_ptr, "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes)); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif } /// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. __device__ __forceinline__ void cp_async_group_commit() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 asm volatile("cp.async.commit_group;\n" ::); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif } /// Blocks until all but previous cp.async.commit_group operations have committed. template __device__ __forceinline__ void cp_async_wait_group() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif } /// Blocks until all previous cp.async.commit_group operations have committed. 
// cp.async.wait_all is equivalent to : // cp.async.commit_group; // cp.async.wait_group 0; -__device__ __forceinline__ void cp_async_wait_all() { asm volatile("cp.async.wait_all;\n" ::); } +__device__ __forceinline__ void cp_async_wait_all() +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_all;\n" ::); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif +} #endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh index f13abe036279..8023629caac9 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef PTX_MMA_CUH -#define PTX_MMA_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_PTX_MMA_CUH +#define DEEPSPEED_CUDA_LINEAR_PTX_MMA_CUH #include #include @@ -22,6 +22,7 @@ __device__ __forceinline__ void B_FromSharedToReg( half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], int slice_id) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #ifdef DEBUG_MODE static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); @@ -54,6 +55,9 @@ __device__ __forceinline__ void B_FromSharedToReg( smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); } } +#else +#warning "The matrix load functions are only supported on Ampere and newer architectures" +#endif } #else // Debug: Whether ldmatrix.trans is required??? @@ -64,6 +68,7 @@ __device__ __forceinline__ void B_FromSharedToReg( half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], int k_offset) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #ifdef DEBUG_MODE static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); @@ -96,6 +101,9 @@ __device__ __forceinline__ void B_FromSharedToReg( smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); } } +#else +#warning "The matrix load functions are only supported on Ampere and newer architectures" +#endif } #endif @@ -103,6 +111,7 @@ __device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t __restrict__ c[], uint32_t __restrict__* a, uint32_t __restrict__* b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{ %0, %1, %2, %3}," @@ -120,6 +129,9 @@ __device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t __restrict__ c[], "r"(c[1]), "r"(c[2]), "r"(c[3])); +#else +#warning "The mma functions are only implemented for Ampere and newer architectures" +#endif } #endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh index 713cebc57e33..a65575a1ba5a 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef UTILS_CORE_CUH -#define UTILS_CORE_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_CORE_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_CORE_CUH #include diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh 
b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh index 62b77edaa37a..d0c58352cd56 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef UTILS_GMEM_CUH -#define UTILS_GMEM_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_GMEM_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_GMEM_CUH #include #include "configs.h" diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh index ff13868c1347..9f035f12cfcd 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef UTILS_PARALLELDEQUANT_CUH -#define UTILS_PARALLELDEQUANT_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_PARALLELDEQUANT_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_PARALLELDEQUANT_CUH #include #include diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h index c8cc7243f341..98805e6c111c 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_WEIGHT_PREPACKING_H +#define DEEPSPEED_CUDA_LINEAR_WEIGHT_PREPACKING_H + #include #include #include @@ -202,3 +205,5 @@ void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K) for (size_t i = 0; i < BytesPerThread_4bit * 32 / 4; i++) BitInterleaving_4bit(Weight_4bit + 4 * i); } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp similarity index 99% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp index 677bec22ded8..3b4966eb822b 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp @@ -5,7 +5,7 @@ #include -#include "cuda_linear_kernels.h" +#include "linear_kernels.h" namespace { diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h similarity index 85% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h index 0f5882d519ca..01a6b7c18af8 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h @@ -3,13 +3,14 @@ // DeepSpeed Team -#pragma once +#ifndef DEEPSPEED_CUDA_LINEAR_KERNELS_H +#define DEEPSPEED_CUDA_LINEAR_KERNELS_H #include #include #include "ds_kernel_utils.h" -#include "fp6_linear.cuh" +#include "linear_kernels_cuda.h" void cuda_wf6af16_linear(torch::Tensor& output, torch::Tensor& 
hidden_states, @@ -23,3 +24,5 @@ void cuda_wf6af16_linear(torch::Tensor& output, int split_k); std::vector preprocess_weight(torch::Tensor& Weight); + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu similarity index 98% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu index 64e06a5435c6..ea0203c42f84 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu @@ -19,6 +19,8 @@ #include #include +#include "linear_kernels_cuda.h" + template static void Kernel_Ex(cudaStream_t stream, const uint4* Weight1, @@ -50,7 +52,7 @@ static void Kernel_Ex(cudaStream_t stream, size_t dimM = M_Global * Split_K / TilingConfig::TILE_M; dim3 GridDim(dimN, dimM, 1); dim3 BlockDim(WARP_SIZE * TilingConfig::BLOCK_WARPS, 1, 1); -// + #ifdef DEBUG_MODE printf( "GridDim.x: %d, GridDim.y: %d, GridDim.z: %d, BlockDim.x: %d, BlockDim.y: %d, BlockDim.z: " @@ -64,6 +66,7 @@ static void Kernel_Ex(cudaStream_t stream, SHMEM_SZ); printf("\n"); #endif + QUANT_GEMM_Kernel<<>>( Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); } @@ -121,7 +124,7 @@ cudaError_t fp6_linear_kernel(cudaStream_t stream, break; default: if (N_PowerOf2 % 128 != 0) { - printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2); + printf("QuantLLM_API Error: Unsupported N dimension %lu!\n", N_PowerOf2); return cudaErrorUnknown; } Kernel_Ex, half>( @@ -192,7 +195,7 @@ cudaError_t fp6_linear_kernel(cudaStream_t stream, break; default: if (N_PowerOf2 % 128 != 0) { - printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2); + printf("QuantLLM_API Error: Unsupported N dimension %lu!\n", N_PowerOf2); return cudaErrorUnknown; } Kernel_Ex, float>(stream, @@ -236,7 +239,7 @@ calling our CUDA kernel. torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats, torch::Tensor _weights, torch::Tensor _scales, - int splitK = 1) + int splitK) { int num_in_feats = _in_feats.size(0); int num_in_channels = _in_feats.size(1); diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h similarity index 94% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h index 95f7f6050c15..6a83290f0cb5 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_FP6_LINEAR_CUH +#define DEEPSPEED_CUDA_LINEAR_FP6_LINEAR_CUH + #include #include #include @@ -44,3 +47,5 @@ void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K); * Weight prepacking (Pytorch interface). 
*/ torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t M, size_t K); + +#endif diff --git a/deepspeed/inference/v2/modules/heuristics.py b/deepspeed/inference/v2/modules/heuristics.py index d176206f3c60..f719e299a4b2 100644 --- a/deepspeed/inference/v2/modules/heuristics.py +++ b/deepspeed/inference/v2/modules/heuristics.py @@ -90,8 +90,17 @@ def instantiate_linear(linear_config: DSLinearConfig, engine_config: RaggedInfer if quantization_mode is None: config = ConfigBundle(name="blas_fp_linear", config=linear_config) else: - # Currently, we only support ``quantized_wf6af16_linear``. + # Currently, we only support ``quantized_wf6af16_linear`` on NVIDIA Ampere GPUs. if quantization_mode == "wf6af16": + import torch + if not torch.cuda.is_available(): #ignore-cuda + raise ValueError("WF6AF16 quantization is only supported on CUDA") + else: + is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None + if is_rocm_pytorch: + raise ValueError("WF6AF16 quantization is only supported on NVIDIA GPUs") + elif torch.cuda.get_device_properties(0).major != 8: #ignore-cuda + raise ValueError("WF6AF16 quantization is only supported on Ampere architectures") config = ConfigBundle(name="quantized_wf6af16_linear", config=linear_config) else: raise ValueError(f"Unsupported quantization mode: {quantization_mode}") diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py index 3c53774d0a50..d1957f39d9a8 100755 --- a/op_builder/inference_core_ops.py +++ b/op_builder/inference_core_ops.py @@ -57,8 +57,6 @@ def get_prefix(self): return "deepspeed" if os.path.isdir(ds_path) else ".." def sources(self): - import torch - sources = [ "inference/v2/kernels/core_ops/core_ops.cpp", "inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp", @@ -69,17 +67,10 @@ def sources(self): "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu", "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cpp", "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu", + "inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp", + "inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu", ] - # The source files with specific GPU architecture requirements. 
- if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda - cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda - if cuda_capability != 8: - self.warning("FP6 quantization kernel is only supported on Ampere architectures") - else: - sources.append("inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu") - sources.append("inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp") - prefix = self.get_prefix() sources = [os.path.join(prefix, src) for src in sources] return sources diff --git a/tests/unit/inference/v2/modules/test_quantized_linear_module.py b/tests/unit/inference/v2/modules/test_quantized_linear_module.py index a7bd965072ac..050f21c3bf3a 100644 --- a/tests/unit/inference/v2/modules/test_quantized_linear_module.py +++ b/tests/unit/inference/v2/modules/test_quantized_linear_module.py @@ -145,10 +145,9 @@ def _fp6_quantized_linear_helper(tokens: int, ActivationType.ReGLU, ActivationType.SiGLU, ] -all_tokens = [1, 37] +all_tokens = [37] all_in_out_channels = [ (4096, 4096), - (8192, 28672), ] From 2b3d31f9804f024f0cbb036771c35a5bdd9ef848 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 15 Apr 2024 12:53:41 -0700 Subject: [PATCH 15/36] Update version.txt after 0.14.1 release (#5413) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.14.1 Author - @loadams Co-authored-by: loadams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 930e3000bdc9..e867cc2a66a8 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.1 +0.14.2 From 8949105369ccaacecc7b707601b787e4af730e89 Mon Sep 17 00:00:00 2001 From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com> Date: Tue, 16 Apr 2024 01:28:56 +0530 Subject: [PATCH 16/36] Remove dtype(fp16) condition check for residual_add unit test (#5329) When the dtype is bf16 or fp32 the if condition is not satisfied and it continues execution instead of skipping when triton is not installed. 
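For intuition, a tiny, hedged sketch of the boolean logic (stand-in variables, not the test itself) shows why bf16/fp32 cases slipped past the old guard; the actual one-line fix is in the diff below:

```python
# Illustrative only: the old skip condition included `dtype == torch.float16`,
# so on a machine without triton the bf16/fp32 parametrizations evaluated the
# guard to False and kept running into the triton code path instead of skipping.
import torch

HAS_TRITON = False      # stand-in for deepspeed.HAS_TRITON on a box without triton
use_triton_ops = True

for dtype in (torch.float16, torch.bfloat16, torch.float32):
    old_skip = (not HAS_TRITON) and use_triton_ops and dtype == torch.float16
    new_skip = (not HAS_TRITON) and use_triton_ops
    print(dtype, "old guard skips:", old_skip, "new guard skips:", new_skip)
```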
Co-authored-by: Shaik Raza Sikander Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- tests/unit/ops/transformer/inference/test_residual_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index c2952f74ff2d..91830e25fc81 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -77,7 +77,7 @@ def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, f @pytest.mark.parametrize("use_triton_ops", [True, False]) def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops and dtype == torch.float16: + if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) From 258e500f3fcd33fa42871ff87612d135399017ac Mon Sep 17 00:00:00 2001 From: YiSheng5 Date: Wed, 17 Apr 2024 01:40:45 +0800 Subject: [PATCH 17/36] [XPU] Use non_daemonic_proc by default on XPU device (#5412) Set non_daemonic_proc=True by default on XPU Device, using non_daemonic_proc for unit test. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- tests/unit/common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/common.py b/tests/unit/common.py index 1fd83de81f02..a2593e703aef 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -248,6 +248,10 @@ def _launch_procs(self, num_procs): f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" ) + if get_accelerator().device_name() == 'xpu': + self.non_daemonic_procs = True + self.reuse_dist_env = False + # Set start method to `forkserver` (or `fork`) mp.set_start_method('forkserver', force=True) From 0896503e2f4d3b12583dfe267e52db3a1d63b88d Mon Sep 17 00:00:00 2001 From: inkcherry Date: Wed, 17 Apr 2024 02:21:26 +0800 Subject: [PATCH 18/36] Fix a convergence issues in TP topology caused by incorrect grad_norm. (#5411) Some users are concerned that changes in TP topology during MOE training may potentially cause interference with experiments when noticing similar issues https://github.com/microsoft/Megatron-DeepSpeed/issues/151 https://github.com/microsoft/Megatron-DeepSpeed/pull/176/files We found a grad_norm calculation error after enabling TP. This error occurs because flattened grad of a params group is used, where the group contains both non-TP and TP parameters. Therefore, it is not possible to use a single attribute to determine whether flattened grad needs to compute the norm. In the current code logic, all params are assumed to be non-TP, resulting in only tp_rank0 grad participating in grad_norm computation. Other tp_rank grads have grad_norm_sum equal to 0. We tested and found that with TP=1 and TP=4, the difference in grad_norm is approximately twice (sqrt(4)). This aligns with the aforementioned issue. This problem should also affect dense models. Due to the absence of flattening params_group grad in bf16, this problem is avoided. We tested the loss curve on the 1.3B model. 
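For intuition before the loss-curve comparison, here is a minimal, hedged sketch of the masking idea (made-up sizes and ranges; the real optimizer builds the mask without a Python loop and reduces the result across ranks):

```python
# Hedged sketch: a flattened fp32 group that concatenates a TP-partitioned weight
# (every TP rank owns a distinct shard, so every rank counts it) and a replicated
# bias (identical on all TP ranks, so only TP rank 0 should count it). The
# [start, end) ranges mark the replicated span to exclude on ranks > 0.
import torch

tp_rank = 1
tp_weight_shard = torch.randn(8)   # this rank's shard of a TP-partitioned weight
replicated_bias = torch.randn(4)   # replicated across TP ranks
flat_grad = torch.cat([tp_weight_shard, replicated_bias])

mask_ranges = [[8, 12]] if tp_rank > 0 else []   # spans to drop from this rank's norm

mask = torch.zeros_like(flat_grad, dtype=torch.bool)
for start, end in mask_ranges:
    mask[start:end] = True

local_sq_norm = flat_grad.masked_fill(mask, 0.0).float().norm(2) ** 2
# Summing local_sq_norm across ranks (then taking the sqrt) now counts each
# parameter exactly once, instead of the old behaviour where TP ranks > 0
# contributed nothing and the sharded portions were under-counted.
```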
In cases where TP size increases the inconsistent gap should be larger. with this change 1.3B with EP=4 TP=4 &1 , fp16,mbs=1,gbs=16 ![image](https://github.com/microsoft/DeepSpeed/assets/27563729/855042c8-ac8a-4192-b465-5fa60c1a7c59) without this change 1.3B with EP=4 TP=4&1 ,fp16,mbs=1,gbs=16 ![image](https://github.com/microsoft/DeepSpeed/assets/27563729/66854d14-7b83-4b09-a669-b452d6157ea0) --------- Co-authored-by: Conglong Li --- deepspeed/runtime/fp16/fused_optimizer.py | 65 +++++++++++++++++++---- deepspeed/runtime/utils.py | 54 ++++++++----------- 2 files changed, 77 insertions(+), 42 deletions(-) diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index af8050c4646a..bf1693307ea7 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -9,15 +9,16 @@ import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers +from deepspeed.runtime.utils import get_global_norm, get_flattened_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers, is_model_parallel_parameter from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger, log_dist from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import is_moe_param_group +from deepspeed.runtime.constants import PIPE_REPLICATED +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank OVERFLOW_CHECK_TIMER = 'overflow_check' COMPUTE_NORM_TIMER = 'compute_norm' @@ -64,6 +65,8 @@ def __init__(self, self.fp16_groups_flat = [] self.fp32_groups_flat = [] + self.flatten_grad_norm_mask_list = [] + self.has_executed_step = False self._global_grad_norm = 0. # loop to deal with groups @@ -206,6 +209,40 @@ def override_loss_scale(self, loss_scale): self.custom_loss_scaler = True self.external_loss_scale = loss_scale + def _require_avoid_recompute_norm(self, p, tensor_model_parallel_rank): + # for filtering replicated tensors from tensor + if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: + return True + if (tensor_model_parallel_rank > 0) and not is_model_parallel_parameter(p): + return True + + def _get_norm_mask_idx(self, group): + """The function preserves the parallel information for norm + from unflattened gradients. + + Args: + group (Iterable[Tensor] ): params group + + Returns: + torch.Tensor: A 2D tensor containing index ranges for each group, + where each row represents a [start index, end index]. + """ + group_mask_idx_list = [] + grad_flat_st_idx = 0 + grad_flat_en_idx = 0 + + for p in group: + grad_flat_en_idx = grad_flat_st_idx + p.numel() + if p.grad is not None and self._require_avoid_recompute_norm(p, bwc_tensor_model_parallel_rank(self.mpu)): + # merge range + if len(group_mask_idx_list) > 0 and grad_flat_st_idx == group_mask_idx_list[-1][-1]: + group_mask_idx_list[-1][-1] = grad_flat_en_idx + else: + group_mask_idx_list.append([grad_flat_st_idx, grad_flat_en_idx]) + grad_flat_st_idx = grad_flat_en_idx + + return torch.tensor(group_mask_idx_list, device=get_accelerator().current_device()) + def step(self, closure=None): """ Not supporting closure. 
@@ -251,23 +288,32 @@ def step(self, closure=None): for p in group ])) - for p in group: - p.grad = None - self.fp32_groups_flat[i].grad = grads_groups_flat[i] param_group = self.optimizer.param_groups[i] + + # split expert and non_expert grads for norm if self.has_moe_layers and is_moe_param_group(param_group): if param_group['name'] not in expert_grads_for_norm: expert_grads_for_norm[param_group['name']] = [] + expert_grads_for_norm[param_group['name']].append(self.fp32_groups_flat[i]) else: + # retrieves the required mask for calculating the norm of flat_grad + # perform this collect operation only once + if not self.has_executed_step: + cur_flat_grad_norm_mask = self._get_norm_mask_idx(group) + self.flatten_grad_norm_mask_list.append(cur_flat_grad_norm_mask) + non_experts_grads_for_norm.append(self.fp32_groups_flat[i]) - self.timers(COMPUTE_NORM_TIMER).start() + for p in group: + p.grad = None - all_groups_norm = get_grad_norm(non_experts_grads_for_norm, mpu=self.mpu) + self.timers(COMPUTE_NORM_TIMER).start() - self.timers(COMPUTE_NORM_TIMER).stop() + all_groups_norm = get_flattened_grad_norm(non_experts_grads_for_norm, + mpu=self.mpu, + grad_norm_mask=self.flatten_grad_norm_mask_list) if self.has_moe_layers: all_groups_norm = get_norm_with_moe_layers(all_groups_norm, @@ -276,6 +322,7 @@ def step(self, closure=None): norm_type=self.norm_type) scaled_global_grad_norm = get_global_norm(norm_list=[all_groups_norm]) + self.timers(COMPUTE_NORM_TIMER).stop() # Stash unscaled gradient norm self._global_grad_norm = scaled_global_grad_norm / self.cur_scale @@ -298,7 +345,7 @@ def step(self, closure=None): updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data.copy_(q.data) - + self.has_executed_step = True self.timers(UPDATE_FP16_TIMER).stop() self.timers.log(STEP_TIMERS) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index c55f8a0e2995..7744b2ee8b98 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -17,7 +17,6 @@ import torch from deepspeed import comm as dist - try: from torch._six import inf except ModuleNotFoundError: @@ -385,7 +384,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): return total_norm -def get_grad_norm(parameters, norm_type=2, mpu=None): +def get_flattened_grad_norm(parameters, norm_type=2, mpu=None, grad_norm_mask=None): """Get grad norm of an iterable of parameters. This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and @@ -397,7 +396,8 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): single Tensor that will have gradients normalized norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. - + grad_norm_mask (List[Tensor]): A list of Tensor, where + each Tensor is a 2D Tensor containing ranges of [start_index, end_index]. Returns: Total norm of the parameters (viewed as a single vector). """ @@ -415,18 +415,25 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): total_norm = total_norm_cuda[0].item() else: total_norm = 0. - tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=mpu) - for p in parameters: - # Pipeline parallelism may replicate parameters. Avoid multi-counting. 
- if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: - continue - - # Filter to avoid over-counting replicated tensors from tensor - # model parallelism - if (tensor_mp_rank > 0) and not is_model_parallel_parameter(p): - continue + for idx, p in enumerate(parameters): + # Use grad_norm_mask to avoid redundant computation of flattened gradient norm + if grad_norm_mask is not None and len(grad_norm_mask[idx]) > 0: + + # A loop-free implementation to create a mask tensor based on a range list + # which is logically equivalent to the following implementation. + # # mask_tensor_ = torch.zeros_like(p, device=p.device, dtype=bool) + # # for mask_idx in grad_norm_mask[idx]: + # # mask_tensor_[mask_idx[0]:mask_idx[1]] = True + cum_sum_pairs = torch.tensor([1, -1], device=get_accelerator().current_device(), + dtype=p.dtype).repeat(grad_norm_mask[idx].shape[0], 1) + mask_tensor = torch.zeros(p.shape[0] + 1, device=get_accelerator().current_device(), dtype=p.dtype) + mask_tensor = mask_tensor.scatter_(0, grad_norm_mask[idx].view(-1), + cum_sum_pairs.view(-1)).cumsum(0).bool()[:-1] + + param_norm = torch.masked_fill(p.grad.data, mask_tensor, 0).float().norm(norm_type) - param_norm = p.grad.data.float().norm(norm_type) + else: + param_norm = p.grad.data.float().norm(norm_type) total_norm += param_norm.item()**norm_type # Sum across all model parallel GPUs. @@ -814,25 +821,6 @@ def get_only_unique_item(items): return unique_item -def clip_gradients(parameters, max_norm=1.0, global_grad_norm=None, mpu=None, eps=1e-6): - """Clip the gradient of a list of parameters. - Args: - parameters: List of parameters whose .grad will be clipped. - global_grad_norm (float, optional): Precomputed gradient norm. Defaults to None. - mpu (optional): model parallelism unit. Defaults to None. - eps (float, optional): epsilon value added to grad norm. Defaults to 1e-6 - Returns: - float: the global gradient norm - """ - if global_grad_norm is None: - global_grad_norm = get_grad_norm(parameters, mpu=mpu) - clip_coef = max_norm / (global_grad_norm + eps) - if clip_coef < 1: - for p in parameters: - p.grad.detach().mul_(clip_coef) - return global_grad_norm - - def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=False, moe_ep_group=None): """Get norm of an iterable of tensors. From 34969d2091fcfd37545361111d7ce7d13229d1a3 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:27:45 -0700 Subject: [PATCH 19/36] Update 'create-pr' action in release workflow to latest (#5415) A [warning is shown](https://github.com/microsoft/DeepSpeed/actions/runs/8695213322/job/23845782048#step:10:31) when we do releases: ``` [deploy](https://github.com/microsoft/DeepSpeed/actions/runs/8695213322/job/23845782048) Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: peter-evans/create-pull-request@v4. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/. 
``` To resolve this we update the create a pull request to `@v6`, see release notes [here](https://github.com/peter-evans/create-pull-request/releases) --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5a931125eff6..2f571a14b228 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -35,7 +35,7 @@ jobs: run: | python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }} - name: Create Pull Request - uses: peter-evans/create-pull-request@v4 + uses: peter-evans/create-pull-request@v6 with: token: ${{ secrets.GH_PAT }} add-paths: | From bc0f77472828323fdc0ae67f62123948bc2b12d1 Mon Sep 17 00:00:00 2001 From: "Etienne.bfx" Date: Tue, 16 Apr 2024 21:43:44 +0200 Subject: [PATCH 20/36] Update engine.py to avoid torch warning (#5408) The state_dict function of module.py from torch write a warning if arguments are positional arguments and not keyword arguments --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: ebonnafoux --- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 992d7877c179..9a2b943b0992 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -2542,7 +2542,7 @@ def all_gather_scalar(self, value, dp_group): return tensor_list def module_state_dict(self, destination=None, prefix="", keep_vars=False, exclude_frozen_parameters=False): - sd = self.module.state_dict(destination, prefix, keep_vars) + sd = self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) # Remove frozen parameter weights from state_dict if specified if exclude_frozen_parameters: From a9cbd688f01c7742397f53144b2223f9956540ac Mon Sep 17 00:00:00 2001 From: Shafiq Jetha <1066864+fasterinnerlooper@users.noreply.github.com> Date: Tue, 16 Apr 2024 14:49:35 -0600 Subject: [PATCH 21/36] Update _sidebar.scss (#5293) The right sidebar disappears off of the right side of the page. These changes will help bring the content back and place it correctly on the page. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- docs/_sass/minimal-mistakes/_sidebar.scss | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/_sass/minimal-mistakes/_sidebar.scss b/docs/_sass/minimal-mistakes/_sidebar.scss index 63cef338c583..312a6279b9b0 100644 --- a/docs/_sass/minimal-mistakes/_sidebar.scss +++ b/docs/_sass/minimal-mistakes/_sidebar.scss @@ -76,10 +76,9 @@ @include breakpoint($large) { position: absolute; - top: 0; + top: auto; right: 0; width: $right-sidebar-width-narrow; - margin-right: -1.5 * $right-sidebar-width-narrow; padding-left: 1em; z-index: 10; @@ -94,7 +93,6 @@ @include breakpoint($x-large) { width: $right-sidebar-width; - margin-right: -1.5 * $right-sidebar-width; } } From 4d4c99d9e51afa01a8f4fc7d59a0bc09128abf3e Mon Sep 17 00:00:00 2001 From: Liangliang-Ma Date: Thu, 18 Apr 2024 00:48:28 +0800 Subject: [PATCH 22/36] Add more tests into XPU CI (#5427) After #5412 merged. We can add more tests into the CI. 
--- .github/workflows/xpu-max1100.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index f83c58dc7ff5..34325b4522d7 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -61,14 +61,28 @@ jobs: - name: Unit tests run: | - pip install pytest pytest-timeout tabulate + pip install pytest pytest-timeout tabulate tensorboard wandb + export ONEAPI_ROOT=/opt/intel/oneapi/redist + export FI_PROVIDER_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib/prov + export LD_LIBRARY_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=$ONEAPI_ROOT/lib:$LD_LIBRARY_PATH cd tests/unit pytest --verbose accelerator/* pytest --verbose autotuning/* pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose checkpoint/test_moe_checkpoint.py + pytest --verbose checkpoint/test_shared_weights.py pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose moe/test_moe_tp.py + pytest --verbose monitor/* pytest --verbose runtime/test_ds_config_model.py pytest --verbose runtime/pipe/test_pipe_schedule.py pytest --verbose runtime/zero/test_zero_config.py pytest --verbose runtime/zero/test_zero_tiled.py pytest --verbose runtime/zero/test_zeropp.py + pytest --verbose runtime/test_autocast.py + pytest --verbose runtime/test_data.py + pytest --verbose runtime/test_runtime_utils.py + pytest --verbose runtime/activation_checkpointing/* + pytest --verbose runtime/utils/* + pytest --verbose runtime/zero/test_zero_dynamic_class.py From b22706a7211366abf2df98a0d118ea1d3a837e21 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 18 Apr 2024 02:52:36 +0800 Subject: [PATCH 23/36] [CPU] Support SHM based inference_all_reduce in TorchBackend (#5391) This PR adds SHM based `inference_all_reduce` kernel to `TorchBackend` communication backend. When inference on CPU server, this path replaces default `torch.distributed.all_reduce` which eventurally use gloo backend. This PR will improve inference performance with AutoTP when only stock PyTorch is installed without Intel Extension for PyTorch. Compared with gloo backend. SHM based inference_all_reduce kernel is a more directed path and perform much better on single node. | message size | gloo all_reduce(ms) | SHM all_reduce(ms) | | --- | --- | --- | | 32MB | 30.7 | 0.65 | | 64KB | 0.23 | 0.028 | In text generation of bloom-3b with AutoTP, average token latency improved 1.45x with this PR on 2S Xeon node. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- accelerator/cpu_accelerator.py | 6 +- csrc/cpu/comm/ccl.cpp | 2 +- csrc/cpu/comm/shm_interface.cpp | 120 ++++++++++++++++++++++++++++++++ deepspeed/comm/ccl.py | 6 +- deepspeed/comm/comm.py | 2 +- deepspeed/comm/torch.py | 20 +++++- op_builder/cpu/__init__.py | 2 +- op_builder/cpu/comm.py | 27 +++++++ 8 files changed, 174 insertions(+), 11 deletions(-) create mode 100644 csrc/cpu/comm/shm_interface.cpp diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 870d3e91816e..a0171723cfb8 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -300,12 +300,14 @@ def get_op_builder(self, class_name): # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed # if successful this also means we're doing a local install and not JIT compile path from op_builder import __deepspeed__ # noqa: F401 # type: ignore - from op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder except ImportError: - from deepspeed.ops.op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from deepspeed.ops.op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder if class_name == "CCLCommBuilder": return CCLCommBuilder + elif class_name == "ShareMemCommBuilder": + return ShareMemCommBuilder elif class_name == "FusedAdamBuilder": return FusedAdamBuilder elif class_name == "CPUAdamBuilder": diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 786906717f23..d28509e59266 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -247,7 +247,7 @@ void all_reduce_caching(torch::Tensor& data, .wait()); } -void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) +void inference_all_reduce(torch::Tensor& data, py::object op) { #ifdef DO_PROFILE static double total_time = 0.0; diff --git a/csrc/cpu/comm/shm_interface.cpp b/csrc/cpu/comm/shm_interface.cpp new file mode 100644 index 000000000000..981ea36515fc --- /dev/null +++ b/csrc/cpu/comm/shm_interface.cpp @@ -0,0 +1,120 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include "shm.h" + +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif + +// Communication settings +static int world_rank = -1; +static int world_size = -1; + +static bool is_initialized = 0; + +static bool all_ranks_local_p = false; + +void initialize(int size, int rank) +{ + if (is_initialized) return; + + // Check whether all ranks is on the same physical machine. 
+ // If true, we will use an SHM based low latency allreduce + + auto ls_string = std::getenv("LOCAL_SIZE"); + int ls = 0; + if (ls_string != NULL) { ls = std::stoi(std::getenv("LOCAL_SIZE")); } + + if (size >= 1 && size == ls) { all_ranks_local_p = true; } + + world_size = size; + world_rank = rank; + is_initialized = 1; + + auto addr_string = std::getenv("MASTER_ADDR"); + if (addr_string == NULL) { addr_string = ""; } + auto port_string = std::getenv("MASTER_PORT"); + if (port_string == NULL) { port_string = ""; } + + if (all_ranks_local_p) { shm_initialize(size, rank, addr_string, port_string); } +} + +int get_rank(int group = 0) { return world_rank; } + +int get_world_size(int group = 0) { return world_size; } + +// Success - return 0 +// Fail (cannot hornor the request and need to fall back) - return -1 +int inference_all_reduce(torch::Tensor& data, py::object op) +{ + if (!all_ranks_local_p) return -1; +#ifdef DO_PROFILE + static double total_time = 0.0; + static double total_time_sq = 0.0; + static int count = -16; // warmup + static double max_time = 0.0; + static double min_time = DBL_MAX; + // make sure all rank reach this point before measuring time + // turn on this if you suspect each rank didn't reach here at the same time (stragger) + // if (all_ranks_local_p) { barrier_wait(0, world_size); } + auto start = std::chrono::system_clock::now(); +#endif + + static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); + static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); + + assert(py::int_(op.attr("value")) == ReduceOpSum); + + auto numel = data.numel(); + + int data_size = 0; + bool data_type_fallback = false; + + switch (data.scalar_type()) { + case c10::ScalarType::BFloat16: data_size = numel * 2; break; + case c10::ScalarType::Float: data_size = numel * 4; break; + default: data_type_fallback = true; + } + + if (data_type_fallback) return -1; + + all_reduce_outer_loop(data, numel, data_size); + +#ifdef DO_PROFILE + auto end = std::chrono::system_clock::now(); + count++; + if (count > 0) { + double elapsed = std::chrono::duration_cast(end - start).count(); + if (elapsed > max_time) { max_time = elapsed; } + if (elapsed < min_time) { min_time = elapsed; } + total_time += elapsed; + total_time_sq += elapsed * elapsed; + if (world_rank == 0 && count == 1000) { + auto avg = total_time / count; + auto sd = + sqrt(total_time_sq / count - total_time * total_time / (count * count)) / avg * 100; + printf(" C++ kernel\t\t %.2f\t %.2f\t%.2f\t %.2f\n", + min_time, + max_time, + total_time / count, + sd); + } + } +#endif + return 0; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("initialize", &initialize, "shm initialize"); + m.def("get_rank", &get_rank, "get rank"); + m.def("get_world_size", &get_world_size, "get world size"); + m.def("inference_all_reduce", &inference_all_reduce, "low latency all_reduce implementation"); +} diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index a199843d5086..f2a6cb6a36dc 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -98,12 +98,12 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): else: return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op) - def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): + def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None): name = "inference_all_reduce" if name in self.available_coll: - return 
self.ccl_comm_op.inference_all_reduce(tensor, op, async_op) + return self.ccl_comm_op.inference_all_reduce(tensor, op) else: - return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=async_op) + return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=False) def broadcast(self, tensor, src, group=None, async_op=False): return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index c92ae74375f1..4f826edab3d6 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -505,7 +505,7 @@ def inference_all_reduce(tensor, log_name='all_reduce', debug=get_caller_func()): global cdb - return cdb.inference_all_reduce(tensor, op, group, async_op) + return cdb.inference_all_reduce(tensor, op, group) @timed_op diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 35c31f9fe2ec..14d4f3847315 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -3,6 +3,7 @@ # DeepSpeed Team +import deepspeed from deepspeed import utils from .utils import * @@ -19,6 +20,15 @@ DS_COMM_REDUCE_OFF = False +def build_shm_op(): + builder = get_accelerator().create_op_builder("ShareMemCommBuilder") + if builder is None or not deepspeed.ops.__compatible_ops__[builder.NAME]: + return None + shm_cpp_module = builder.load() + print(f'DeepSpeed {builder.absolute_name()} built successfully') + return shm_cpp_module + + def has_coalescing_manager(): has_c10d = hasattr(torch.distributed, 'distributed_c10d') return has_c10d and hasattr(torch.distributed.distributed_c10d, '_coalescing_manager') @@ -88,6 +98,7 @@ class TorchBackend(Backend): def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'): super(TorchBackend, self).__init__() + self.shm_comm_op = build_shm_op() self.has_all_reduce_coalesced = has_all_reduce_coalesced() self.has_coalescing_manager = has_coalescing_manager() self.all_gather_function = self.get_all_gather_function() @@ -99,6 +110,8 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name=' # it is not so we can run on a single GPU without doing any init_process_group self.single_gpu_mode = True self.init_process_group(backend, timeout, init_method, rank, world_size) + if self.shm_comm_op != None: + self.shm_comm_op.initialize(self.get_world_size(), self.get_rank()) @classmethod @compiler.disable @@ -139,9 +152,10 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, asyn return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) @compiler.disable - def inference_all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): - op = self._reduce_op(op) - return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + def inference_all_reduce(self, tensor, op, group=None): + if self.shm_comm_op == None or self.shm_comm_op.inference_all_reduce(tensor, op) == -1: + op = self._reduce_op(op) + return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=False) @compiler.disable def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py index 0ecf2caf65ae..30238add3f90 100644 --- a/op_builder/cpu/__init__.py +++ b/op_builder/cpu/__init__.py @@ -4,7 +4,7 @@ # DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' -from .comm import 
CCLCommBuilder +from .comm import CCLCommBuilder, ShareMemCommBuilder from .fused_adam import FusedAdamBuilder from .cpu_adam import CPUAdamBuilder from .no_impl import NotImplementedBuilder diff --git a/op_builder/cpu/comm.py b/op_builder/cpu/comm.py index 38e965530f43..69b9226fd806 100644 --- a/op_builder/cpu/comm.py +++ b/op_builder/cpu/comm.py @@ -42,3 +42,30 @@ def extra_ldflags(self): return [] else: return ['-lccl', f'-L{ccl_root_path}/lib'] + + +class ShareMemCommBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_SHM_COMM" + NAME = "deepspeed_shm_comm" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def sources(self): + return ['csrc/cpu/comm/shm_interface.cpp', 'csrc/cpu/comm/shm.cpp'] + + def include_paths(self): + includes = ['csrc/cpu/includes'] + return includes + + def cxx_args(self): + return ['-O2', '-fopenmp'] + + def is_compatible(self, verbose=True): + # TODO: add soft compatibility check for private binary release. + # a soft check, as in we know it can be trivially changed. + return super().is_compatible(verbose) From 3194fe85c5b62940f5479deba0396121f2d87d59 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:27:07 -0700 Subject: [PATCH 24/36] Add required paths to trigger AMD tests on PRs (#5406) Even though AMD tests are currently broken, this will at least trigger them on PRs that touch files that might impact them. Since the test name is listed as `amd-tests` rather than `unit-tests` they will currently not be required, however. Co-authored-by: root --- .github/workflows/amd-mi200.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index 00ff72ac8929..cd1cafe8e679 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -2,6 +2,10 @@ name: amd-mi200 on: workflow_dispatch: + pull_request: + paths: + - '.github/workflows/amd-mi200.yml' + - 'requirements/**' schedule: - cron: "0 0 * * *" From aaaf8bc5e07535e263f83733f8905400bf6f5aca Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 18 Apr 2024 20:39:07 +0200 Subject: [PATCH 25/36] Bug fix in `split_index` method (#5292) Bug description: on a dataset of 20 samples, when running 4 workers with 8 threads per worker, then the `split_dataset` would return for worker id `1`: ``` self.worker_splits [[0, 5], [5, 10], [10, 15], [15, 20]] self.thread_splits [[5, 6], [6, 7], [7, 8], [8, 9], [9, 10], [10, 10], [11, 10], [12, 10]] ``` `thread_splits` is wrong and causes a crash in the `DataAnalyzer`: the end sample id is lower than the initial one on the last 2 threads. 
This PR fixes that by fixing the behaviour of `split_index` --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/data_pipeline/data_sampling/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/utils.py b/deepspeed/runtime/data_pipeline/data_sampling/utils.py index 9c643f3705de..dc55f96e222d 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/utils.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/utils.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import math import numpy as np from deepspeed.utils import logger @@ -32,10 +31,8 @@ def find_fit_int_dtype(min_value, max_value): def split_index(start_idx, end_idx, num_partitions): - partition_size = math.ceil((end_idx - start_idx) / num_partitions) - partitions = [[start_idx + x * partition_size, - min(end_idx, start_idx + (x + 1) * partition_size)] for x in range(num_partitions)] - return partitions + partition_boundaries = np.linspace(start_idx, end_idx, dtype=int, num=num_partitions + 1) + return [(partition_boundaries[i], partition_boundaries[i + 1]) for i in range(num_partitions)] def split_dataset(dataset, num_workers, worker_id, num_threads): From 64defe65b73f856466935befc49fb188756ab558 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 18 Apr 2024 23:14:08 +0200 Subject: [PATCH 26/36] Parallel map step for `DistributedDataAnalyzer` map-reduce (#5291) - adds multi CPU-processing to the `DistributedDataAnalyzer` map operation (parallelism set with parameter `num_workers`). Works with a `SharedMemory` / `Manager's` queue per metric, written concurrently by processes. - much faster `write_buffer_to_file` in `DistributedDataAnalyzer` reduce operation by copying to cpu and "detaching" output tensor. 
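As a rough, self-contained sketch of the per-metric queue pattern described above (the names `run_map_on_slice`, `metric_mod` and `metric_square` are illustrative and not part of this PR's diff), each worker process maps its own slice of the data and pushes one result per metric into a shared `multiprocessing.Manager` queue, which the parent drains after joining:

```python
from multiprocessing import Manager, Process

def metric_mod(x):
    return x % 7

def metric_square(x):
    return x * x

METRIC_FNS = [metric_mod, metric_square]

def run_map_on_slice(thread_id, splits, metric_queues):
    # each worker handles one contiguous slice and accumulates every metric
    start, end = splits[thread_id]
    results = [sum(fn(i) for i in range(start, end)) for fn in METRIC_FNS]
    for queue, result in zip(metric_queues, results):
        queue.put((thread_id, result))  # one shared queue per metric

if __name__ == "__main__":
    splits = [(0, 250), (250, 500), (500, 750), (750, 1000)]
    with Manager() as manager:
        metric_queues = [manager.Queue() for _ in METRIC_FNS]
        workers = [Process(target=run_map_on_slice, args=(t, splits, metric_queues))
                   for t in range(len(splits))]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        # drain each metric's queue; sorting by thread id restores slice order
        gathered = [sorted(q.get() for _ in range(len(splits))) for q in metric_queues]
        print(gathered)
```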
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Conglong Li --- .../data_sampling/data_analyzer.py | 128 ++++++++++++------ .../data_sampling/indexed_dataset.py | 6 +- 2 files changed, 93 insertions(+), 41 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 7088df223bd8..93d351169834 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import sys from collections import defaultdict import csv import time @@ -12,8 +13,8 @@ import torch from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset -from deepspeed.utils import logger import deepspeed.comm as dist +from deepspeed.utils import logger from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset, valid_dtypes from deepspeed.runtime.data_pipeline.data_sampling.utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype @@ -457,6 +458,7 @@ def __init__( self, dataset, num_workers=1, + num_threads=1, worker_id=0, batch_size=1, metric_names=[], @@ -477,6 +479,8 @@ def __init__( self.collate_fn = collate_fn self.device = device self.sample_indices = sample_indices + self.num_threads = num_threads + self.worker_id = worker_id if not dist.is_initialized(): dist.init_distributed() @@ -494,13 +498,9 @@ def __init__( if self.worker_id == 0: logger.info(f"Distributed data analyzer initialized with {self.num_workers} workers.") - def run_map_reduce(self): - - # setup individual dataloaders - worker_splits, _ = split_dataset(self.dataset, self.num_workers, self.worker_id, num_threads=1) - start_idx, end_idx = worker_splits[self.worker_id] - logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") - worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) + def run_map_helper(self, thread_id=0, metric_queues=None): + thread_start_idx, thread_end_idx = self.thread_splits[thread_id][0], self.thread_splits[thread_id][1] + worker_dataset = Subset(self.dataset, list(range(thread_start_idx, thread_end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, @@ -516,7 +516,7 @@ def run_map_reduce(self): metric_results.append([] if metric_type == 'single_value_per_sample' else None) # iterate dataloader and store metric results - batch_start_idx = start_idx + batch_start_idx = thread_start_idx for data in dataloader: for m_idx in range(len(self.metric_names)): metric_type, metric_function = self.metric_types[m_idx], self.metric_functions[m_idx] @@ -544,15 +544,73 @@ def run_map_reduce(self): metric_results[m_idx].add_(metric_values) batch_start_idx += len(data) + if self.num_threads == 1: + return metric_results + + # copy metric_results to the shared queue + assert metric_queues + for m_idx in range(len(self.metric_names)): + results = metric_results[m_idx] + if torch.is_tensor(results): + results = results.item() if results.dim() == 0 else results.tolist() + try: + metric_queues[m_idx].put((thread_id, results)) + except Exception as e: + logger.error(f"Error putting metric results to queue: {e}") + sys.exit(1) + + def run_map_reduce(self): + + # setup individual dataloaders + 
self.worker_splits, self.thread_splits = split_dataset(self.dataset, + self.num_workers, + self.worker_id, + num_threads=self.num_threads) + node_start_idx, node_end_idx = self.worker_splits[self.worker_id] + logger.info(f"worker {self.worker_id} working on data subset {node_start_idx} to {node_end_idx}.") + + if self.num_threads in [0, 1, None]: + metric_results = self.run_map_helper() + metric_results = [torch.tensor(m).to(self.device) for m in metric_results] + else: + + # create a shared queue of results per metric to be populated by individual threads + with Manager() as manager: + metric_queues = [manager.Queue() for _ in self.metric_names] + threads = [ + Process(target=self.run_map_helper, args=(t, metric_queues)) for t in range(self.num_threads) + ] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # gather results from shared queues into metric_results + metric_results = [None for _ in self.metric_names] + for m_idx, (queue, metric_type) in enumerate(zip(metric_queues, self.metric_types)): + while not queue.empty(): + t_idx, t_results = queue.get() + t_start_idx, t_end_idx = self.thread_splits[t_idx] + if t_start_idx >= t_end_idx: # no results from this thread + continue #corner case for small datasets and high thread count + t_results = torch.tensor(t_results) + if metric_type == 'single_value_per_sample': + # add thread results to the metric_results list, ordered by thread idx + if metric_results[m_idx] is None: # initialize if needed + metric_results[m_idx] = torch.zeros(node_end_idx - node_start_idx, + t_results.size(1)).to(self.device) + metric_results[m_idx][t_start_idx - node_start_idx:t_end_idx - node_start_idx] = t_results + else: + if metric_results[m_idx] is None: # initialize if needed + metric_results[m_idx] = torch.zeros(t_results.size()).to(self.device) + metric_results[m_idx].add_(t_results) + # compute dtype for sample ids total_num_samples = len(self.dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - # convert to list of tensors - metric_results = [torch.tensor(m).to(self.device) for m in metric_results] - for m_idx in range(len(self.metric_names)): metric_values, metric_name, metric_type = \ metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] @@ -611,8 +669,8 @@ def run_map_reduce(self): def file_write_ordered(self, tensor_list, fname, numpy_dtype): """ MPI_file_write_ordered extended to write a list of tensors, by one rank, iteratively """ - # each not has a list of rows (tensors) to be written to the file. - # we will serialize it to communicate it in one comm step. + # each node has a list of rows (tensors) to be written to the file. + # we will serialize it in order to communicate it in one comm step. 
tkwargs = dict(dtype=torch.int64, device=self.device) @@ -636,17 +694,13 @@ def file_write_ordered(self, tensor_list, fname, numpy_dtype): def write_buffer_to_file(buff, src, builder): assert self.worker_id == 0, "only rank 0 can write to file" - # # write one buffer at a time - # for row_len in row_lens[src]: - # builder.add_item(buff[:row_len].cpu()) - # buff = buff[row_len:] - - # collect all buffers and write them all at once - buffer_list = [] - for row_len in row_lens[src]: - buffer_list.append(buff[:row_len].cpu()) - buff = buff[row_len:] - builder.add_items(buffer_list) + # collect all buffers and write them at once + buff = buff.cpu().detach().numpy() + row_offsets = np.cumsum([0] + row_lens[src].tolist()) + arr_list = [] + for i in range(len(row_lens[src])): + arr_list.append(buff[row_offsets[i]:row_offsets[i + 1]]) + builder.add_items(arr_list) # 5. rank 0 prepares output folder and file if self.worker_id == 0: @@ -700,7 +754,7 @@ def gather_v(tensor, dst, comm_group, num_workers, worker_id): # all_gather requires all tensors to be of same size so we need to pad them max_size = max(sizes).item() buffer = torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) - buffer[0:size] = torch.tensor(tensor, dtype=tensor.dtype, device=tensor.device) + buffer[0:size] = tensor.data buffer_list = None if worker_id == 0: # create padded recv buffers buffer_list = [torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) for _ in range(num_workers)] @@ -763,16 +817,18 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): def test_compare_both_data_analyzers(dataset): """ given a dataset, compare file and memory based data analyser""" - id = lambda t: torch.tensor(t).to(torch.int64) # identity + id = lambda t: t.to(torch.int64) # identity batch_sum = lambda t: id(t).sum() #sum batch + num_threads = 4 kwargs = dict( dataset=dataset, - batch_size=3, + batch_size=2**10, worker_id=int(os.environ['RANK']), num_workers=int(os.environ['WORLD_SIZE']), metric_names=["mod", "batch_sum"], metric_functions=[id, batch_sum], metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], + num_threads=num_threads, ) dda = DistributedDataAnalyzer( @@ -785,10 +841,9 @@ def test_compare_both_data_analyzers(dataset): if dda.worker_id == 0: print("DistributedDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) - da = DataAnalyzer(num_threads=2, - num_threads_reduce=2, - metric_dtypes=[torch.int64, torch.int64], + da = DataAnalyzer(num_threads_reduce=num_threads, save_path="./output_disk", + metric_dtypes=[torch.int64, torch.int64], **kwargs) start_time = time.time() da.run_map_reduce() @@ -815,14 +870,11 @@ def test_compare_both_data_analyzers(dataset): class TestDataset(torch.utils.data.Dataset): - def __init__(self, size=20): - self.values = [1001 + x % 6 for x in range(size)] + def __init__(self, size=10_000_000): + self.values = [(x + 7) % 10_000 for x in range(size)] self.size = size - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.values[idx] + __len__ = lambda self: self.size + __getitem__ = lambda self, idx: self.values[idx] test_compare_both_data_analyzers(TestDataset()) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index 453e6ba6039d..872d05de0145 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -586,9 +586,9 @@ def 
add_item(self, tensor): self._data_file.write(np_array.tobytes(order='C')) self._sizes.append(np_array.size) - def add_items(self, tensor_list): - """ write a list of tensors to the file and update their sizes in the index""" - np_arrays = [np.array(t.numpy(), dtype=self._dtype) for t in tensor_list] + def add_items(self, arr_list): + """ write a list of arrays to the file and update their sizes in the index""" + np_arrays = [arr.astype(self._dtype) for arr in arr_list] self._data_file.writelines([arr.tobytes(order='C') for arr in np_arrays]) for arr in np_arrays: self._sizes.append(arr.size) From c632ea09f8d107d10f76aa2b776e4df3c1ccf98a Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Fri, 19 Apr 2024 08:58:27 -0700 Subject: [PATCH 27/36] Selective dequantization (#5375) This PR adds a new functionality for the dequantizer function, called `selective_dequantize`, which enables partially dequantizing a 3-dimensional matrix in case we don't need to dequantize all the data from lower bit (like fp8/fp6) to bf16. I also added a unit test to check its functionality. --------- Co-authored-by: Reza Yazdani Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- csrc/fp_quantizer/includes/quantize.h | 11 ++ csrc/fp_quantizer/quantize.cpp | 31 +++++ csrc/fp_quantizer/quantize.cu | 135 +++++++++++++++++++ deepspeed/ops/fp_quantizer/quantize.py | 35 +++++ tests/unit/ops/fp_quantizer/test_fp_quant.py | 29 ++++ 5 files changed, 241 insertions(+) diff --git a/csrc/fp_quantizer/includes/quantize.h b/csrc/fp_quantizer/includes/quantize.h index 2204c1ba74fc..507252d6e722 100644 --- a/csrc/fp_quantizer/includes/quantize.h +++ b/csrc/fp_quantizer/includes/quantize.h @@ -113,3 +113,14 @@ void launch_dequantization(uint8_t* val, int q_mantisa_bits, int q_exponent_bits, cudaStream_t stream); + +template +void launch_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int num_groups, + int group_size, + int num_indexes, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream); diff --git a/csrc/fp_quantizer/quantize.cpp b/csrc/fp_quantizer/quantize.cpp index 4a88ff767636..ec631c576e27 100644 --- a/csrc/fp_quantizer/quantize.cpp +++ b/csrc/fp_quantizer/quantize.cpp @@ -78,8 +78,39 @@ void dequantize(torch::Tensor& val, #endif } +#define DISPATCH_DEQUANTIZE_INDEX(T_TYPE, C_TYPE, mantisa) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_selective_dequantization((uint8_t*)val_q.data_ptr(), \ + (C_TYPE*)val.data_ptr(), \ + (int32_t*)indexes.data_ptr(), \ + num_groups, \ + group_size, \ + num_indexes, \ + q_mantisa_bits, \ + q_exponent_bits, \ + at::cuda::getCurrentCUDAStream()); \ + return; \ + } +void selective_dequantize(torch::Tensor& val, + torch::Tensor& val_q, + torch::Tensor& indexes, + int group_size, + int q_mantisa_bits, + int q_exponent_bits) +{ + int total_elems = at::numel(val); + int num_indexes = indexes.size(0); + int num_groups = total_elems / group_size; + + DISPATCH_DEQUANTIZE_INDEX(kHalf, __half, 10); +#ifdef BF16_AVAILABLE + DISPATCH_DEQUANTIZE_INDEX(kBFloat16, __nv_bfloat16, 7); +#endif +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("quantize", &quantize, "quantize function"); m.def("dequantize", &dequantize, "dequantize function"); + m.def("selective_dequantize", &selective_dequantize, "selective dequantize function"); } diff --git a/csrc/fp_quantizer/quantize.cu b/csrc/fp_quantizer/quantize.cu index 5f0b58f124f0..5ada6894747f 100644 --- 
a/csrc/fp_quantizer/quantize.cu +++ b/csrc/fp_quantizer/quantize.cu @@ -270,6 +270,7 @@ __global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size, int mem_access::load_global( int8_data + quantization::quanitzed_access_granularity_6bits * 2, load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { mem_access::load_global(int8_data, load_base_ptr); @@ -393,3 +394,137 @@ void launch_dequantization(uint8_t* val, INSTANTIATE_LAUNCH_DEQUANTIZATION(__nv_bfloat16, 7); #endif INSTANTIATE_LAUNCH_DEQUANTIZATION(__half, 10); + +template +__global__ void apply_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int group_size, + int total_num_elements) +{ + int index = indexes[blockIdx.x]; + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.y * blockDim.x + threadIdx.x) * vector_size; + int input_index = index * total_num_elements + tidx; + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + const uint32_t g_index = (input_index / group_size); + const uint32_t group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (input_index % group_size) * quantized_bits / 8; + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + T* store_base_ptr = q_val + tidx + blockIdx.x * total_num_elements; + float scale; + + uint8_t* scale_as_int8 = reinterpret_cast(&scale); + if (quantized_bits == 6) { + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + mem_access::load_global( + scale_as_int8 + quantization::quanitzed_access_granularity_6bits, + val + g_index * (group_size_bytes + 4) + group_size_bytes + + quantization::quanitzed_access_granularity_6bits); + } else + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { + mem_access::load_global( + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); + } + } + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); +#pragma unroll + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = 
(uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } + + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); + } + mem_access::store_global(store_base_ptr, store_buf); + } +} + +template +void launch_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int num_groups, + int group_size, + int num_indexes, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream) +{ + int total_elements_per_index = (num_groups / num_indexes) * group_size; + int blocks = (total_elements_per_index - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(num_indexes, blocks); + const dim3 block(quantization::threads); + DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { + apply_selective_dequantization + <<>>(val, q_val, indexes, group_size, total_elements_per_index); + }); +} +#define INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(T, mantisa) \ + template void launch_selective_dequantization( \ + uint8_t*, T*, int32_t*, int, int, int, int, int, cudaStream_t); +// fp8(E4M3) +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(__nv_bfloat16, 7); +#endif +INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(__half, 10); diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py index 5dc3c190ae5d..0d4bf7bc6db1 100644 --- a/deepspeed/ops/fp_quantizer/quantize.py +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -77,3 +77,38 @@ def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=Non fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1) return fp_out + + def selective_dequantize(self, + input_q, + indexes, + fp_out=None, + q_bits=8, + q_mantisa_bits=3, + scale=None) -> torch.Tensor: + assert (not hasattr(self, 'orig_shape') or len(self.orig_shape) == 3), \ + "Selective-Dequantization works on 3d tensor only! Please reshape the tensor before calling dequantize function." + assert (self.orig_dtype is not None), \ + "[De-quantization Error]: you need to call quantize before dequantizing!" + fp_out = torch.empty( + (indexes.shape[0], + *self.orig_shape[1:]), dtype=self.orig_dtype, device=input_q.device) if fp_out is None else fp_out + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-dequantization, please add the template arguments for the kernel to support this precision!" + + if scale is not None: + assert input_q.numel() == fp_out.numel(), \ + f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' 
+ input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous() + + fp_quant_module.selective_dequantize(fp_out, input_q, indexes, self.group_size, q_mantisa_bits, + q_bits - q_mantisa_bits - 1) + return fp_out diff --git a/tests/unit/ops/fp_quantizer/test_fp_quant.py b/tests/unit/ops/fp_quantizer/test_fp_quant.py index 101f4cd69811..bed8bd7e3bcc 100644 --- a/tests/unit/ops/fp_quantizer/test_fp_quant.py +++ b/tests/unit/ops/fp_quantizer/test_fp_quant.py @@ -61,6 +61,35 @@ def test_fp_quant_meta(dtype): assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +def test_fp_quant_selective(dtype): + group_size = 128 + q_bits = 8 + exp_bits = 4 + man_bits = 3 + + fpq = FP_Quantize(group_size=group_size) + indexes = torch.zeros(2, dtype=torch.int32, device='cuda') + indexes[0] = 1 + indexes[1] = 3 + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype, device='cuda') + + x = x.reshape(4, 1, x.shape[-1]) + ds_x = x.clone() + x_quantized = fpq.quantize(ds_x, q_bits=q_bits) + x_dequantized = fpq.selective_dequantize(x_quantized, indexes, q_bits=q_bits) + + qtorch_out = qtorch_quantize(x.index_select(0, indexes), + exp_bits=exp_bits, + man_bits=man_bits, + group_size=group_size) + qtorch_error = (qtorch_out - x.index_select(0, indexes)).abs().sum() / x.numel() + ds_error = (x_dequantized - x.index_select(0, indexes)).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" + + @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) @pytest.mark.parametrize("q_bits", [8, 6, 12], ids=["qbits8", "qbits6", "qbits12"]) def test_fp_quant(dtype, q_bits): From 99951caa3d2155a3bb84109a0828543793e088cc Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Fri, 19 Apr 2024 14:19:47 -0700 Subject: [PATCH 28/36] Fix sorting of shard optimizer states files for universal checkpoint (#5395) This PR resolves the issue reported in #5283. To resolve the issue, we sort files of sharded optimizer states based on DP indices. 
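To make the motivation concrete, here is a small standalone illustration (the unpadded `exp_avg.` shard suffixes are hypothetical and only stand in for the per-rank files): a plain string sort interleaves multi-digit DP ranks, while sorting on the parsed integer index keeps the shards in data-parallel order, which is what the fix relies on:

```python
import re

# Hypothetical shard file suffixes for 12 data-parallel ranks.
paths = [f"exp_avg.{dp}" for dp in range(12)]

# Lexical sorting puts rank 10 and 11 before rank 2 once ranks reach two digits.
print(sorted(paths)[:4])  # ['exp_avg.0', 'exp_avg.1', 'exp_avg.10', 'exp_avg.11']

# Sorting on the integer DP index parsed from the suffix restores rank order.
pattern = re.compile(r"exp_avg\.([0-9]+)$")
dp_sorted = sorted(paths, key=lambda p: int(pattern.search(p).group(1)))
print(dp_sorted[:4])  # ['exp_avg.0', 'exp_avg.1', 'exp_avg.2', 'exp_avg.3']
```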
--------- Co-authored-by: Olatunji Ruwase --- deepspeed/checkpoint/ds_to_universal.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index d5eca81c804f..63fa866718de 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -132,6 +132,10 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D): cnt = 0 +def dp_index_to_str(dp_index): + return f"{dp_index:0>2d}" + + def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel): global cnt # temp hack @@ -140,9 +144,8 @@ def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, os.makedirs(param_base_path, exist_ok=True) cnt += 1 - counter = f"{dp_index:0>2d}" - path = os.path.join(param_base_path, f"{state_name}.{counter}") + path = os.path.join(param_base_path, f"{state_name}.{dp_index_to_str(dp_index)}") #print(f"{param_name}: {offset}: {numel} => {path}") @@ -156,10 +159,21 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): slices = [] for tp_index in range(tp_degree): prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") - paths = sorted(list(glob.glob(f"{prefix_path}.*"))) + paths = glob.glob(f"{prefix_path}.*") + if len(paths) == 0: continue + pattern = re.compile(f"{prefix_path}\\.([0-9]+)") + dp_indices = set() + for p in paths: + m = pattern.match(p) + if m: + dp_indices.add(int(m.group(1))) + else: + raise ValueError(f"Cannot parse dp_rank from {p}") + + paths = [f"{prefix_path}.{dp_index_to_str(dp_index)}" for dp_index in sorted(list(dp_indices))] shards = [torch.load(p) for p in paths] if state == "step": From 3f875d95193fbd3a0c7f0c0dcc8d39469061bb66 Mon Sep 17 00:00:00 2001 From: shiyuan680 <72335504+shiyuan680@users.noreply.github.com> Date: Sun, 21 Apr 2024 07:35:50 +0800 Subject: [PATCH 29/36] add device config env for the accelerator (#5396) Thank you for [pr](https://github.com/microsoft/DeepSpeed/pull/5369) and @delock contribution of ideas. As mentioned in this [pr](https://github.com/microsoft/DeepSpeed/pull/5369), each device has its own environmental variables. We create visible_devices_envs() and set_visible_devices_envs() methods on the accelerator class to enable each accelerator to implement env settings within the interface , which is more generic to other accelerators. this commit has tested on npu, each one has 8 ascend npus --------- Co-authored-by: yangcheng Co-authored-by: eigen2017 Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- README.md | 11 ++++++----- accelerator/abstract_accelerator.py | 8 ++++++++ accelerator/cpu_accelerator.py | 8 ++++++++ accelerator/cuda_accelerator.py | 7 +++++++ accelerator/hpu_accelerator.py | 7 +++++++ accelerator/mps_accelerator.py | 9 +++++++++ accelerator/npu_accelerator.py | 7 +++++++ accelerator/xpu_accelerator.py | 7 +++++++ deepspeed/launcher/launch.py | 11 +++++++---- 9 files changed, 66 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a1335caa4949..201b9016f8ab 100755 --- a/README.md +++ b/README.md @@ -159,11 +159,12 @@ dynamically link them at runtime. ## Contributed HW support * DeepSpeed now support various HW accelerators. 
-| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | -| ----------- | -------- | ---------------- | --------------------- | ------------------ | -| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | -| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | +| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | +|-------------|-------------------------------------|------------------| --------------------- |--------------------| +| Huawei | Huawei Ascend NPU | npu | Yes | No | +| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | +| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 3c5d799e293e..768d5ea34e5e 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -287,3 +287,11 @@ def build_extension(self): @abc.abstractmethod def export_envs(self): ... + + @abc.abstractmethod + def visible_devices_envs(self): + ... + + @abc.abstractmethod + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index a0171723cfb8..237e7f51dcb4 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -322,3 +322,11 @@ def build_extension(self): def export_envs(self): return [] + + # TODO: cpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 3d5e9c168c16..2fc0cfd94125 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -360,3 +360,10 @@ def build_extension(self): def export_envs(self): return ['NCCL'] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 30b115e8b1ab..326efc8fa01b 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -294,3 +294,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['HABANA_VISIBLE_MODULES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 972b33caece1..ff70b860d7c7 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -258,3 +258,12 @@ def build_extension(self): def export_envs(self): return [] + + # TODO: mpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + # TODO: could not find visible devices env for mps + return 
['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 472157e32c02..5d891ecb707d 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -278,3 +278,10 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['ASCEND_RT_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 3f65263946ab..c59f60077d2f 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -289,3 +289,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['ZE_AFFINITY_MASK'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index ffb9cbc18fbd..079182a3b45b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -22,6 +22,7 @@ from collections import defaultdict from typing import Dict from argparse import ArgumentParser, REMAINDER +from deepspeed.accelerator import get_accelerator from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..nebula.constants import DLTS_POD_ENV_PATH from ..utils import logger, get_numactl_cmd @@ -146,8 +147,8 @@ def main(): node_list = list(world_info.keys()) args.nnodes = len(node_list) local_node = node_list[args.node_rank] - local_gpu_ids = world_info[local_node] - num_local_procs = len(local_gpu_ids) + local_accelerator_ids = world_info[local_node] + num_local_procs = len(local_accelerator_ids) logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) @@ -161,8 +162,10 @@ def main(): curr_global_rank += 1 logger.info(f"global_rank_mapping={global_rank_mapping}") logger.info(f"dist_world_size={dist_world_size}") - current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids)) - logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}") + + get_accelerator().set_visible_devices_envs(current_env, local_accelerator_ids) + for env in get_accelerator().visible_devices_envs(): + logger.info(f"Setting {env}={current_env[env]}") # set PyTorch distributed related environmental variables current_env["MASTER_ADDR"] = args.master_addr From 9b6ef9e1f0d8acaefd989440b27da9069aa69207 Mon Sep 17 00:00:00 2001 From: Wei Fu <36355462+garrett4wade@users.noreply.github.com> Date: Tue, 23 Apr 2024 03:47:00 +0800 Subject: [PATCH 30/36] 64bit indexing fused adam (#5187) ## The Issue Applying `FusedAdam` on large tensors will cause an error `CUDA error: an illegal memory access was encountered`. https://github.com/microsoft/DeepSpeed/issues/3429 https://github.com/NVIDIA/apex/issues/1654 ## PR Content Following the solution in the apex repository (https://github.com/NVIDIA/apex/pull/1765), changing indexing type to `int64` if necessary. 
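As a loose Python-level sketch of the dispatch idea (the actual check in this PR lives in the C++ extension; `needs_64bit_indexing` is an illustrative name, not an API added here): scan every tensor handed to the fused kernel and switch to 64-bit offsets as soon as any of them reaches `INT_MAX` elements:

```python
import torch

INT_MAX = 2**31 - 1

def needs_64bit_indexing(tensor_lists):
    # If any tensor has INT_MAX or more elements, 32-bit offsets can overflow,
    # so the kernel must be instantiated with 64-bit indexing instead.
    return any(t.numel() >= INT_MAX for group in tensor_lists for t in group)

# Illustrative only: tiny CPU tensors stand in for real parameter/gradient groups.
params = [torch.zeros(4), torch.zeros(8)]
grads = [torch.zeros(4), torch.zeros(8)]
print(needs_64bit_indexing([grads, params]))  # False for these small tensors
```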
--------- Co-authored-by: Michael Wyatt Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- csrc/adam/multi_tensor_adam.cu | 77 +++++++++++++++++++++++--------- csrc/adam/multi_tensor_apply.cuh | 10 ++--- 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu index d6b9b2f70710..a1fc7d15aec9 100644 --- a/csrc/adam/multi_tensor_adam.cu +++ b/csrc/adam/multi_tensor_adam.cu @@ -30,7 +30,7 @@ typedef enum : int { using MATH_T = float; -template +template struct AdamFunctor { __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, @@ -48,13 +48,13 @@ struct AdamFunctor { // if(*noop_gmem == 1) // return; - int tensor_loc = tl.block_to_tensor[blockIdx.x]; + index_t tensor_loc = tl.block_to_tensor[blockIdx.x]; // potentially use to pass in list of scalar // int tensor_num = tl.start_tensor_this_launch + tensor_loc; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; + index_t chunk_idx = tl.block_to_chunk[blockIdx.x]; + index_t n = tl.sizes[tensor_loc]; T* g = (T*)tl.addresses[0][tensor_loc]; g += chunk_idx * chunk_size; @@ -71,7 +71,8 @@ struct AdamFunctor { n -= chunk_idx * chunk_size; // see note in multi_tensor_scale_kernel.cu - for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { + for (index_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * ILP) { MATH_T r_g[ILP]; MATH_T r_p[ILP]; MATH_T r_m[ILP]; @@ -146,23 +147,57 @@ void multi_tensor_adam_cuda(int chunk_size, bias_correction2 = 1 - std::pow(beta2, step); } + size_t max_size = 0; + bool requires_64bit_indexing = false; + for (auto it = tensor_lists.begin(); it != tensor_lists.end(); it++) { + for (auto it2 = it->begin(); it2 != it->end(); it2++) { + if (it2->numel() > max_size) { + max_size = it2->numel(); + if (max_size >= INT_MAX) { + requires_64bit_indexing = true; + break; + } + } + } + if (requires_64bit_indexing) { break; } + } + // Assume single type across p,g,m1,m2 now - DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), - 0, - "adam", - multi_tensor_apply<4>(BLOCK_SIZE, - chunk_size, - noop_flag, - tensor_lists, - AdamFunctor(), - beta1, - beta2, - bias_correction1, - bias_correction2, - epsilon, - lr, - (adamMode_t)mode, - weight_decay);) + if (requires_64bit_indexing) { + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>((int64_t)BLOCK_SIZE, + (int64_t)chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + } else { + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>(BLOCK_SIZE, + chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + } AT_CUDA_CHECK(cudaGetLastError()); } diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh index 12f41cb49c6b..342376c141be 100644 --- a/csrc/adam/multi_tensor_apply.cuh +++ b/csrc/adam/multi_tensor_apply.cuh @@ -35,7 +35,7 @@ struct TensorListMetadata { }; template -__global__ void multi_tensor_apply_kernel(int chunk_size, +__global__ void multi_tensor_apply_kernel(int64_t chunk_size, volatile int* noop_flag, T tl, U callable, @@ -46,8 +46,8 @@ __global__ void multi_tensor_apply_kernel(int chunk_size, 
} template -void multi_tensor_apply(int block_size, - int chunk_size, +void multi_tensor_apply(int64_t block_size, + int64_t chunk_size, const at::Tensor& noop_flag, const std::vector>& tensor_lists, T callable, @@ -91,9 +91,9 @@ void multi_tensor_apply(int block_size, tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); loc_tensor_info++; - int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; + auto chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; - for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { + for (auto chunk = 0; chunk < chunks_this_tensor; chunk++) { // std::cout << chunks_this_tensor << std::endl; tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; tl.block_to_chunk[loc_block_info] = chunk; From c292b03a403cf0f6fd747f6106ad40060c70d3f8 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:50:15 -0700 Subject: [PATCH 31/36] Improve parallel process of universal checkpoint conversion (#5343) The conversion script from a regular checkpoint to the universal one runs the followings in parallel. 1. extracts zero sharded optimizer states 2. merge the shards However, it passes `map()` a set of only a few tasks (the number specified as workers). Thus it needs to wait for the slowest tasks to finish for every set. This PR submits all the tasks to the pool and wait until the futures get ready. We can keep all workers running. --------- Co-authored-by: Olatunji Ruwase --- deepspeed/checkpoint/ds_to_universal.py | 34 ++++++++----------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index 63fa866718de..b1a8276589b6 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -10,7 +10,7 @@ import argparse import glob import itertools -import multiprocessing +from concurrent.futures import ProcessPoolExecutor import os import re import shutil @@ -292,27 +292,18 @@ def get_matched_sub_params_pattern(name_): return unmatched_patterns -def _get_chunks(l, n): - for i in range(0, len(l), n): - yield l[i:i + n] - - def _do_parallel_work(do_work, work_chunks, num_workers): + results = [] if num_workers > 1: - pool = multiprocessing.Pool(num_workers) - results = [] - for batch in tqdm.tqdm(work_chunks): - res = pool.map(do_work, batch) - results.extend(res) - pool.close() - pool.join() + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_list = [executor.submit(do_work, work) for work in work_chunks] + for f in tqdm.tqdm(future_list): + results.append(f.result()) else: # No parallel pass for unit testing # We can't create child processes in tests - results = [] - for batch in tqdm.tqdm(work_chunks): - res = [do_work(x) for x in batch] - results.extend(res) + for work in tqdm.tqdm(work_chunks): + results.append(do_work(work)) return results @@ -321,20 +312,15 @@ def _extract_zero_shard_files(args, ds_checkpoint, temp_dir): itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) #pprint(f'{_3d_range_list=}') - work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers)) - #pprint(f'{work_chunks=}') - # extract_zero_shards(temp_dir, ds_checkpoint, _3d_range_list[0]) do_work = partial(extract_zero_shards, temp_dir, ds_checkpoint) - _do_parallel_work(do_work, work_chunks, args.num_extract_workers) + 
_do_parallel_work(do_work, _3d_range_list, args.num_extract_workers) def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): - work_chunks = list(_get_chunks(list(slice_shapes.items()), args.num_merge_workers)) - #pprint(work_chunks) zero_output_folder = os.path.join(args.output_folder, "zero") do_work = partial(merge_tp_slices, ds_checkpoint, zero_output_folder, temp_dir, ds_checkpoint.tp_degree) - unmatched_patterns_lists = _do_parallel_work(do_work, work_chunks, args.num_merge_workers) + unmatched_patterns_lists = _do_parallel_work(do_work, list(slice_shapes.items()), args.num_merge_workers) # verify that all patterns were used # if a pattern was not used by any of the workers, then it was not used at all -> assert/alert From c66bc4269e9484b6e57d6f5521df02c70d399246 Mon Sep 17 00:00:00 2001 From: inkcherry Date: Tue, 23 Apr 2024 07:27:09 +0800 Subject: [PATCH 32/36] set the default to use set_to_none for clearing gradients in BF16 optimizer. (#5434) as discussed in #5175, set the default to use set_to_none for clearing gradients in BF16 optimizer. Additionally, for the case of zero clearing, use foreach_zero. Verified correctness with mega-ds llama 7B training. FYI @loadams --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/bf16_optimizer.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index f970e582b354..1f3365b20f4e 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -341,7 +341,7 @@ def _update_hp_grad(self, lp, group_idx, param_idx, clear_lp_grads): # clear gradients if clear_lp_grads: - lp.grad._zero() + lp.grad.zero_() @torch.no_grad() def _update_hp_grads_func(self, clear_lp_grads=False): @@ -441,11 +441,20 @@ def clear_hp_grads(self): self.fp32_groups_has_gradients[i] = [False] * len(group) def clear_lp_grads(self): + + # using zero_() fixed memory address for graph replay + set_to_none = False if self.graph_harvesting else True + zero_grads_list = [] for group in self.bf16_groups: for param in group: - if param.grad is not None: - # Using zero_() fixed memory address for graph replay - param.grad.zero_() + if set_to_none: + param.grad = None + elif param.grad is not None: + if param.grad.grad_fn is not None: + param.grad.detach_() + zero_grads_list.append(param.grad) + if not set_to_none and len(zero_grads_list) > 0: + torch._foreach_zero_(zero_grads_list) def state_dict(self): state_dict = {} From 5e6c9b931184bf8f0a245a2fef183078139b6d7e Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 23 Apr 2024 12:24:37 -0700 Subject: [PATCH 33/36] OptimizedLinear implementation (#5355) Optimized version of `nn.Linear` that adds features such as: * LoRA w. 
base weight sharding * FP [6,8,12] quantization Depends on #5336 being merged first Co-authored-by: @rajhans Co-authored-by: @aurickq --------- Co-authored-by: Rajhans Samdani Co-authored-by: Jeff Rasley --- deepspeed/linear/__init__.py | 7 ++ deepspeed/linear/config.py | 39 +++++++ deepspeed/linear/optimized_linear.py | 150 +++++++++++++++++++++++++ deepspeed/linear/quantization.py | 137 ++++++++++++++++++++++ deepspeed/ops/fp_quantizer/__init__.py | 2 +- deepspeed/ops/fp_quantizer/quantize.py | 33 +++++- tests/unit/linear/test_linear.py | 128 +++++++++++++++++++++ tests/unit/linear/test_quant_param.py | 58 ++++++++++ 8 files changed, 550 insertions(+), 4 deletions(-) create mode 100644 deepspeed/linear/__init__.py create mode 100644 deepspeed/linear/config.py create mode 100644 deepspeed/linear/optimized_linear.py create mode 100644 deepspeed/linear/quantization.py create mode 100644 tests/unit/linear/test_linear.py create mode 100644 tests/unit/linear/test_quant_param.py diff --git a/deepspeed/linear/__init__.py b/deepspeed/linear/__init__.py new file mode 100644 index 000000000000..a27f1c3eaee7 --- /dev/null +++ b/deepspeed/linear/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .optimized_linear import OptimizedLinear +from .config import LoRAConfig, QuantizationConfig diff --git a/deepspeed/linear/config.py b/deepspeed/linear/config.py new file mode 100644 index 000000000000..ae9050a3c92b --- /dev/null +++ b/deepspeed/linear/config.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from dataclasses import dataclass + + +@dataclass +class LoRAConfig: + """ + Configuration settings for LoRAOptimizedLinear. + + Attributes: + lora_r (int): LoRA attention dimension, also know as the rank. Defaults is 64. + lora_alpha (float): LoRA scaling factor, default is 16. + base_weight_sharding (int): The degree to which the base weights are sharded, + should typically be set to the data-parallel world size to maximize the memory + reduction benefits. Defaults to 1, which means this feature is disabled. + """ + lora_r: int = 64 + lora_alpha: float = 16. + base_weight_sharding: int = 1 + + +@dataclass +class QuantizationConfig: + """ + Configuration settings for quantization for LoRAOptimizedLinear, QuantizedLinear, + and QuantizedParameter + + Attributes: + q_bits (int): The number of bits used for quantization. Default is 8. + mantissa_bits (int): The number of bits reserved for the mantissa in fixed-point quantization. Default is 3. + group_size (int): The size of the group used for quantization. Default is 512. + """ + q_bits: int = 8 + mantissa_bits: int = 3 + group_size: int = 512 diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py new file mode 100644 index 000000000000..138bd493ffc7 --- /dev/null +++ b/deepspeed/linear/optimized_linear.py @@ -0,0 +1,150 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import math +import torch.nn as nn +import torch.nn.functional as F +from dataclasses import is_dataclass +from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist + +from .config import LoRAConfig, QuantizationConfig +from .quantization import QuantizedParameter, QuantizedLinear + + +class OptimizedLinear(nn.Module): + """ + Optimized version of nn.Linear that adds features such as: + * LoRA w. 
base weight sharding + * FP [6,8,12] quantization + + Arguments: + input_dim: Required: size of each input sample + output_dim: Required: size of each output sample + bias: Optional: If set to False, the layer will not learn an additive bias. Default: False + lora_config: Optional: LoRAConfig defining lora features and base-weight-sharding degree + quantization_config: Optional: QuantizationConfig defining quantization features + dtype: Optional: parameter dtype, only supports bfloat16 currently + + Returns: + Returns a new nn.Module depending on the input config. Either native + torch.nn.Linear, QuantizedLinear, or the full-featured DSOptimizedLinear. + """ + + def __new__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + + if quantization_config is not None and not is_dataclass(quantization_config): + raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}") + if lora_config is not None and not is_dataclass(lora_config): + raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}") + if lora_config is None and quantization_config is None: + # Everything disabled, fall back to normal nn.Linear + self = nn.Linear(input_dim, output_dim, bias=bias, dtype=dtype) + + elif lora_config: + # lora enabled, quantization may or may not be + self = LoRAOptimizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=dtype) + + elif quantization_config: + # only quantization enabled, no lora + self = QuantizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + quantization_config=quantization_config, + dtype=dtype) + return self + + +class LoRAOptimizedLinear(nn.Module): + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + device=None, + dtype=torch.bfloat16): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.bias = bias + self.lora_config = lora_config + self.quantization_config = quantization_config + device = get_accelerator().current_device() if device is None else device + assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config" + + self.zero_shards = self.lora_config.base_weight_sharding + self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards) + w = torch.nn.Parameter(torch.empty((self.output_dim, self.sharded_weight_size), dtype=dtype)) + torch.nn.init.xavier_uniform_(w) + + if self.quantization_config is not None: + assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization" + self.base_weight = QuantizedParameter(w, quantization_config=quantization_config) + else: + self.base_weight = w + + self.base_weight.requires_grad = False + + # Use RS lora for now. + self.lora_scaling_factor = self.lora_config.lora_alpha / math.sqrt(self.lora_config.lora_r) + # Keeping lora weights in bf16 precision for ease of training. 
+ self.lora_weight_1 = nn.Linear(self.input_dim, + self.lora_config.lora_r, + bias=self.bias, + device=device, + dtype=dtype) + self.lora_weight_2 = nn.Linear(self.lora_config.lora_r, + self.output_dim, + bias=self.bias, + device=device, + dtype=dtype) + self.lora_weight_1.weight.requires_grad = True + self.lora_weight_2.weight.requires_grad = True + + def full_weight(self): + # This assumes weights are evenly sharded across gpus. which might not be correct. + # in that case, we should flatten before all_gather. + local_weight = self.base_weight.dequantized() if isinstance(self.base_weight, + QuantizedParameter) else self.base_weight + tensor_list = [ + torch.zeros_like(local_weight, device=local_weight.device, dtype=local_weight.dtype) + for _ in range(self.zero_shards) + ] + dist.all_gather(tensor_list, local_weight) + weight = nn.Parameter(torch.cat([tensor for tensor in tensor_list], dim=1)) + return weight + + def linear_without_F_linear(self, input, weight): + output = torch.mm(input.reshape(-1, input.shape[-1]), weight) + output = output.view(*input.shape[:-1], weight.shape[1]) + return output + + def forward(self, input_tensor): + # Gather the sharded base weight + if self.zero_shards > 1: + with torch.no_grad(): + base_weight = self.full_weight() + elif self.quantization_config: + base_weight = self.base_weight.dequantized() + else: + base_weight = self.base_weight + + base_weight_output = F.linear(input_tensor, base_weight) + lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor)) + return base_weight_output + self.lora_scaling_factor * lora_output diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py new file mode 100644 index 000000000000..f5343af45fb8 --- /dev/null +++ b/deepspeed/linear/quantization.py @@ -0,0 +1,137 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional + +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.fp_quantizer import Quantizer, FP_Quantize +from .config import QuantizationConfig + + +class QuantizedParameter(nn.Parameter): + """ + Quantized parameter class that implements weight quantization. Weights + are stored in quantized form on GPUs, and can be dequantized on-the-fly when + needed by the model. The weights are actually quantized during any `.to(device)`. + + Arguments: + data (Tensor): parameter tensor. + requires_grad (bool, optional): if the parameter requires gradient. Defaults + to False and is not supported to be True. Argument provided only for interface + compatibility with torch.nn.Parameter. + quantization_config (QuantizationConfig, optional): + quantizer (Quantizer, optional): Defaults to FP_Quantize but can be any quantizer + that implements deepspeed.ops.fp_quantizer.Quantizer. This argument is also + required since the quantizer is stashed in the Parameter itself, some models + may clone the Parameter by passing an attribute __dict__. 
For an example, see + tests/unit/linear/test_quant_param.py::TestQuantParam::test_hf_clone + """ + + def __new__( + cls, + data: Optional[torch.Tensor] = None, + requires_grad: bool = False, # quantized weights must be frozen + quantization_config: QuantizationConfig = None, + quantizer: Quantizer = None, + ): + if requires_grad: + raise ValueError(f"requires_grad=True is not supported with QuantizedParameter") + if data is None: + data = torch.empty(0) + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.quantization_config = QuantizationConfig() if quantization_config is None else quantization_config + if quantizer is not None: + self.quantizer = quantizer + else: + # if FPQuantizerBuilder is not compatible in this env this init will fail + self.quantizer = FP_Quantize(group_size=self.quantization_config.group_size) + self._ensure_quantized(self) + return self + + def _ensure_quantized(self, tensor: torch.Tensor): + # If the tensor is on the accelerator and is not quantized, then quantize it in-place. + if get_accelerator().on_accelerator(tensor) and tensor.dtype != torch.int8: + with get_accelerator().stream(get_accelerator().current_stream(tensor.device)): + tensor.data = self.quantizer.quantize(tensor.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + assert tensor.dtype == torch.int8 + + def dequantized(self) -> torch.Tensor: + """ + Return a tensor containing the dequantized weights of this parameter. + """ + if get_accelerator().on_accelerator(self.data) and self.data.dtype == torch.int8: + with get_accelerator().stream(get_accelerator().current_stream(self.data.device)): + return self.quantizer.dequantize(self.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + return self.data + + def __getstate__(self): + state = self.__dict__ + state["data"] = self.data + state["quantization_config"] = self.quantization_config + state["requires_grad"] = self.requires_grad + return state + + def __setstate__(self, state): + self.quantizer = state["quantizer"] + self.quantization_config = state["quantization_config"] + self.data = state["data"] + self.requires_grad = state["requires_grad"] + + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + new_instance.quantizer = copy.deepcopy(state["quantizer"]) + new_instance.quantization_config = copy.deepcopy(state["quantization_config"]) + new_instance.data = copy.deepcopy(state["data"]) + return new_instance + + def __copy__(self): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def cuda(self, device=None, non_blocking=False): + return self.to(device="cuda" if device is None else device, non_blocking=non_blocking) + + def to(self, *args, **kwargs): + """ + Move the parameter to the given device. Then, if the device is a cuda device, + quantize it. + """ + tensor = super().to(*args, **kwargs) + self._ensure_quantized(tensor) + return tensor + + +class QuantizedLinear(nn.Linear): + """ + Linear layer that implements weight quantization. Parameters + are stored via `QuantizedParameter` and are dequantized on-the-fly during any + forward pass. 
+ """ + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + super().__init__(input_dim, output_dim, bias=bias, dtype=dtype) + assert dtype == torch.bfloat16, "currently only supports bfloat16 dtype" + self.weight = QuantizedParameter(self.weight.data, quantization_config=quantization_config) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight.dequantized(), self.bias) diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py index 5575f3567185..995bbae4aeaf 100644 --- a/deepspeed/ops/fp_quantizer/__init__.py +++ b/deepspeed/ops/fp_quantizer/__init__.py @@ -3,4 +3,4 @@ # DeepSpeed Team -from .quantize import FP_Quantize +from .quantize import FP_Quantize, Quantizer diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py index 0d4bf7bc6db1..f8435bda16c1 100644 --- a/deepspeed/ops/fp_quantizer/quantize.py +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -4,20 +4,47 @@ # DeepSpeed Team import torch +import abc +from abc import ABC from deepspeed.ops.op_builder import FPQuantizerBuilder fp_quant_module = None -class FP_Quantize: +class Quantizer(ABC): + """ + Abstract Quantizer class that implmenents quantize/dequantize methods. + + Arguments: + group_size (int, optional): number of values or elements that are grouped + together for the quantization process. + """ + + def __init__(self, group_size=512) -> None: + self.group_size = group_size + + @abc.abstractmethod + def quantize(self, + input, + q_bits=8, + q_mantisa_bits=3, + stochastic_mode=False, + return_meta_tensor=False) -> torch.Tensor: + ... + + @abc.abstractmethod + def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor: + ... + + +class FP_Quantize(Quantizer): def __init__(self, group_size=512) -> None: global fp_quant_module + super().__init__(group_size=group_size) if fp_quant_module is None: fp_quant_module = FPQuantizerBuilder().load() - - self.group_size = group_size self.orig_dtype = None def quantize(self, diff --git a/tests/unit/linear/test_linear.py b/tests/unit/linear/test_linear.py new file mode 100644 index 000000000000..ccd26b4cd726 --- /dev/null +++ b/tests/unit/linear/test_linear.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed +import deepspeed.comm as dist + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig +from unit.common import DistributedTest + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestBasicLinear(DistributedTest): + world_size = 2 + + def test(self): + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 1 # Number of samples in a batch + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + + dummy_input = torch.rand(batch_size, input_features, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2]) +class TestLoRALinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding): + rank = dist.get_rank() + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + if rank == 0: + for n, p in linear_layer.named_parameters(): + print(f"{n}, {p.shape}") + + dummy_input = torch.rand(batch_size, input_features, device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("q_bits", [8, 6]) +class TestQuantLinear(DistributedTest): + world_size = 2 + + def test(self, q_bits): + rank = dist.get_rank() + lora_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = None + quantization_config = QuantizationConfig(q_bits=q_bits) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2], ids=['bws1', 'bws2']) +@pytest.mark.parametrize("q_bits", [8, 6], ids=['qbit8', 'qbit6']) +class TestOptimizedLinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding, q_bits): + rank = dist.get_rank() + lora_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + 
lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + quantization_config = QuantizationConfig(q_bits=q_bits) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) diff --git a/tests/unit/linear/test_quant_param.py b/tests/unit/linear/test_quant_param.py new file mode 100644 index 000000000000..9479b3cba8a0 --- /dev/null +++ b/tests/unit/linear/test_quant_param.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear.quantization import QuantizedParameter +from deepspeed.linear.config import QuantizationConfig + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +from unit.common import DistributedTest + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestQuantParam(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize('dtype', [torch.half, torch.float]) + def test_unsupported_dtypes(self, dtype): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device='cpu', dtype=dtype) + qp = QuantizedParameter(data) + with pytest.raises(AssertionError): + qp.to(device) + + def test_requires_grad(self): + data = torch.rand(5, 5, dtype=torch.bfloat16) + with pytest.raises(ValueError): + QuantizedParameter(data, requires_grad=True) + + def test_move_to_accelerator(self): + device = get_accelerator().current_device() + data = torch.rand(5, 5, device='cpu', dtype=torch.bfloat16) + qp = QuantizedParameter(data) + assert qp.device == torch.device('cpu') + qp = qp.to(get_accelerator().current_device_name()) + assert qp.device == torch.device(device) + assert qp.dtype == torch.int8 + + def test_hf_clone(self): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device=device, dtype=torch.bfloat16) + + quantization_config = QuantizationConfig(q_bits=6) + qp = QuantizedParameter(data, quantization_config=quantization_config) + + # should be able to clone parameter via dict, HF expects this to work + qp_copy = QuantizedParameter(qp.data, **qp.__dict__) + + assert all(qp.data == qp_copy.data) + assert qp.quantization_config == qp_copy.quantization_config From ad2027952f9730cbd1a8385e4a441e470248645e Mon Sep 17 00:00:00 2001 From: Jhonso7393 <167781426+Jhonso7393@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:45:47 +0300 Subject: [PATCH 34/36] Update README.md (#5453) Fixing a minor typo at the README file Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 201b9016f8ab..f9d81eddfdae 100755 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn 
how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). * [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) +* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) * [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html) * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] From 5f631abc2f930ecece38fae05dc9bd3923c555dd Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:24:12 -0700 Subject: [PATCH 35/36] Update PyTest torch version to match PyTorch latest official (2.3.0) (#5454) --- .github/workflows/cpu-torch-latest.yml | 4 ++-- .github/workflows/nv-torch-latest-v100.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 9c1ad02f75a6..5727ff2e1cde 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -50,5 +50,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.2" - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.2" + TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.3" + TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.3" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 3ca8ac43dfa4..2e0490c18ba7 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118 python -c "import 
torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From fbdf0eaf15f45cd2a8b5846ad9428609a3860b41 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Apr 2024 16:27:27 -0700 Subject: [PATCH 36/36] Update version.txt after 0.14.2 release (#5458) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.14.2 Author - @loadams Co-authored-by: loadams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index e867cc2a66a8..ac4a79626c87 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.2 +0.14.3
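
For reference, the OptimizedLinear API introduced in [PATCH 33/36] can be exercised end-to-end with a short script such as the minimal sketch below. It is a single-process illustration (base_weight_sharding=1) and assumes the FP quantizer op is compatible on the system, as the unit tests in that patch require; treat it as an illustration of the API shapes shown in the diffs above rather than shipped documentation.

import torch
from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig
from deepspeed.accelerator import get_accelerator

# LoRA rank 16, no base-weight sharding (single process), 8-bit FP weight quantization.
lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=1)
quantization_config = QuantizationConfig(q_bits=8)

# With neither config supplied, OptimizedLinear falls back to a plain
# torch.nn.Linear, so the same call site works with or without these features.
layer = OptimizedLinear(input_dim=64,
                        output_dim=64,
                        lora_config=lora_config,
                        quantization_config=quantization_config,
                        dtype=torch.bfloat16)

device = get_accelerator().current_device_name()
layer = layer.to(device)  # moving to the accelerator quantizes the frozen base weight

x = torch.rand(5, 64, device=device, dtype=torch.bfloat16)
y = layer(x)
assert y.shape == (5, 64)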