From ca4bc66b5401eb3ecd46d56642fdc4eb658c1c21 Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Tue, 9 Apr 2024 03:11:03 +0530 Subject: [PATCH 01/36] Increase CI coverage for Gaudi2 accelerator. (#5358) This change increase CI coverage for HPU accelerator. --------- Co-authored-by: Logan Adams Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .github/workflows/hpu-gaudi2.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 1881c968b560..12d8ee661fa8 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -8,7 +8,23 @@ on: paths: - ".github/workflows/hpu-gaudi2.yml" - "accelerator/hpu_accelerator.py" - + - "op_builder/hpu/**" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/engine.py" + - "deepspeed/module_inject/auto_tp.py" + - "deepspeed/module_inject/replace_module.py" + - "deepspeed/module_inject/load_checkpoint.py" + - "deepspeed/module_inject/inject.py" + - "deepspeed/ops/transformer/**" + - "deepspeed/ops/adam/**" concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 873738897805f8bff31fd8145bf67ef9e9379095 Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Wed, 10 Apr 2024 01:17:25 +0800 Subject: [PATCH 02/36] Add CI for Intel XPU/Max1100 (#5376) Add basic workflow for tests on intel xpu. Currently we have part of tests enabled. We will add more tests in later PRs. 
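As a minimal sketch of the container sanity check this workflow performs before running the unit tests (assuming the intel/intel-extension-for-pytorch XPU container referenced below, with torch, intel_extension_for_pytorch and DeepSpeed installed), the Python side of the "Check container state" and "Install deepspeed" steps boils down to:

```python
# Standalone version of the checks run in the workflow steps; assumes an XPU container
# where torch, intel_extension_for_pytorch and deepspeed are already installed.
import torch
import intel_extension_for_pytorch  # noqa: F401  # registers the 'xpu' device with torch
from deepspeed.accelerator import get_accelerator

print("torch:", torch.__version__)
print("XPU available:", torch.xpu.is_available())
print("accelerator:", get_accelerator()._name)  # expected to report 'xpu' on a Max1100 node
```

If any of these prints fail or report the wrong accelerator, the remaining pytest steps are not expected to pass.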
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .github/workflows/xpu-max1100.yml | 75 +++++++++++++++++++++++++++++++ tests/unit/util.py | 3 ++ 2 files changed, 78 insertions(+) create mode 100644 .github/workflows/xpu-max1100.yml diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml new file mode 100644 index 000000000000..078cd61801a4 --- /dev/null +++ b/.github/workflows/xpu-max1100.yml @@ -0,0 +1,75 @@ +name: xpu-max1100 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu.yml" + - "accelerator/xpu_accelerator.py" + - "accelerator/abstract_accelerator.py" + - "accelerator/cpu_accelerator.py" + - "accelerator/real_accelerator.py" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + + +jobs: + unit-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/intel-extension-for-pytorch:2.1.20-xpu + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v3 + - name: Check container state + shell: bash + run: | + ldd --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + + - name: Install deepspeed + run: | + pip install py-cpuinfo + pip install .[dev,autotuning] + ds_report + python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + pip install pytest pytest-timeout tabulate + cd tests/unit + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py diff --git a/tests/unit/util.py b/tests/unit/util.py index e8e0f476371b..5acf580ed3c8 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -48,6 +48,7 @@ def bf16_required_version_check(accelerator_check=True): nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' hpu_available = get_accelerator().device_name() == 'hpu' + xpu_available = get_accelerator().device_name() == 'xpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True @@ -55,6 +56,8 @@ def bf16_required_version_check(accelerator_check=True): return True elif hpu_available: return True + elif xpu_available: + return True else: return False From 7e126d20b968db5e2a31f198bab6fd79ab33260b Mon Sep 17 00:00:00 2001 From: 
Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 9 Apr 2024 14:18:49 -0700 Subject: [PATCH 03/36] Update path name on xpu-max1100.yml, add badge in README (#5386) - Fixes xpu-max1100 not running on PR because of incorrect yml name. --- .github/workflows/xpu-max1100.yml | 3 +-- README.md | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 078cd61801a4..43a29598d12c 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -6,7 +6,7 @@ on: - cron: "0 0 * * *" pull_request: paths: - - ".github/workflows/xpu.yml" + - ".github/workflows/xpu-max1100.yml" - "accelerator/xpu_accelerator.py" - "accelerator/abstract_accelerator.py" - "accelerator/cpu_accelerator.py" @@ -20,7 +20,6 @@ on: - "deepspeed/runtime/zero/parameter_offload.py" - "deepspeed/runtime/pipe/engine.py" - "deepspeed/runtime/utils.py" - - "deepspeed/inference/**" concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/README.md b/README.md index 434a0dd53586..5f092a56c117 100755 --- a/README.md +++ b/README.md @@ -133,6 +133,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | | CPU | [![torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | | Intel Gaudi | [![hpu-gaudi2](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | +| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg)?branch=master](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) 
[![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | From eb216a837b988caeef94c0dd6036e0d0f351d4f7 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 9 Apr 2024 16:18:24 -0700 Subject: [PATCH 04/36] Update XPU workflow too --- .github/workflows/xpu-max1100.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 43a29598d12c..f83c58dc7ff5 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -40,7 +40,7 @@ jobs: options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state shell: bash run: | From 62ab18102036a9079e0fd569cb149d6bdbd64d7b Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 9 Apr 2024 16:18:59 -0700 Subject: [PATCH 05/36] Revert "Update XPU workflow too" This reverts commit eb216a837b988caeef94c0dd6036e0d0f351d4f7. --- .github/workflows/xpu-max1100.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index f83c58dc7ff5..43a29598d12c 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -40,7 +40,7 @@ jobs: options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v3 - name: Check container state shell: bash run: | From 63029e8f5d1d248a47ad52c497899cbefffbfee9 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:28:21 -0700 Subject: [PATCH 06/36] Update checkout action on workflows on ubuntu 20.04 (#5387) - Only workflows running Ubuntu 20.04 or later can be updated as the GLIBC that is needed for node 20+ can be updated now. - Workflows that aren't updated are running Ubuntu 18.04 or older, those will need to be moved to updated images shortly and will be updated later in the original PR, #5021 Sample warning that is resolved: ``` Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: actions/checkout@v3. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/. 
``` --- .github/workflows/amd-mi200.yml | 2 +- .github/workflows/cpu-torch-latest.yml | 2 +- .github/workflows/formatting.yml | 2 +- .github/workflows/hpu-gaudi2.yml | 2 +- .github/workflows/nv-a6000.yml | 2 +- .github/workflows/nv-accelerate-v100.yml | 2 +- .github/workflows/nv-ds-chat.yml | 2 +- .github/workflows/nv-h100.yml | 2 +- .github/workflows/nv-inference.yml | 2 +- .github/workflows/nv-mii.yml | 2 +- .github/workflows/nv-nightly.yml | 2 +- .github/workflows/nv-pre-compile-ops.yml | 2 +- .github/workflows/nv-sd.yml | 2 +- .github/workflows/nv-torch-latest-v100.yml | 2 +- .github/workflows/nv-torch-nightly-v100.yml | 2 +- .github/workflows/nv-transformers-v100.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/xpu-max1100.yml | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index e4b938d8e078..00ff72ac8929 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -21,7 +21,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 5096de931be4..9c1ad02f75a6 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index 88dfa34a0a2b..d2554b7c0038 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 12d8ee661fa8..a3c12d057cc9 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -99,7 +99,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 960e0203919e..3ce406948432 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -29,7 +29,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 1fccbece2994..915493bb3183 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -22,7 +22,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index f61637be7e0e..94571eb101bb 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -24,7 +24,7 @@ jobs: runs-on: [self-hosted, 
nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml index 93f074787372..5574ce8aa634 100644 --- a/.github/workflows/nv-h100.yml +++ b/.github/workflows/nv-h100.yml @@ -23,7 +23,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index 6b339f457802..f863226bfb95 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -25,7 +25,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index 31379f7e758b..8452c138c717 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -30,7 +30,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index ca091990cf4b..b1e8c042214f 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -18,7 +18,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index 6e308242ecf0..6afc11fddaab 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -26,7 +26,7 @@ jobs: image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index 0af9517c5b59..b348d5ff931f 100644 --- a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -33,7 +33,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 14d33680521d..3ca8ac43dfa4 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -22,7 +22,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index bd13047f6078..257040439005 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -18,7 +18,7 @@ jobs: runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 75f53c95c235..cfed6d6583e6 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -21,7 +21,7 @@ jobs: runs-on: [self-hosted, nvidia, 
cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8e016b4169cb..5a931125eff6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -11,7 +11,7 @@ jobs: environment: release-env steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: "master" - id: setup-venv diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 43a29598d12c..f83c58dc7ff5 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -40,7 +40,7 @@ jobs: options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state shell: bash run: | From 6dcced1d5c997876e9a1279f79ec8f9339561846 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 10 Apr 2024 08:39:24 -0700 Subject: [PATCH 07/36] Cleanup required_torch_version code and references. (#5370) - Move `required_torch_version` check from deepspeed.runtime.utils to deepspeed.utils.torch (newly created). - Remove unused duplicate definition from `tests/unit/util.py`. - Update all references to this function. - Switch checks in `deepspeed/runtime/pipe/p2p.py` to use this function. - Switch checks in `deepspeed/comm/torch.py` to use this function. --------- Co-authored-by: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> --- deepspeed/comm/torch.py | 28 +++---------------- deepspeed/elasticity/utils.py | 2 +- deepspeed/runtime/comm/nccl.py | 2 +- deepspeed/runtime/fp16/fused_optimizer.py | 3 +- deepspeed/runtime/fp16/onebit/adam.py | 2 +- deepspeed/runtime/fp16/onebit/lamb.py | 2 +- deepspeed/runtime/fp16/onebit/zoadam.py | 2 +- deepspeed/runtime/fp16/unfused_optimizer.py | 3 +- deepspeed/runtime/pipe/p2p.py | 8 ++---- deepspeed/runtime/utils.py | 15 ---------- deepspeed/utils/torch.py | 22 +++++++++++++++ tests/unit/alexnet_model.py | 2 +- tests/unit/checkpoint/test_mics_optimizer.py | 2 +- tests/unit/checkpoint/test_moe_checkpoint.py | 2 +- .../checkpoint/test_universal_checkpoint.py | 2 +- tests/unit/checkpoint/test_zero_optimizer.py | 2 +- tests/unit/compression/test_compression.py | 2 +- .../quantization/test_intX_quantization.py | 2 +- .../test_configurable_parallel_mp.py | 2 +- .../test_configurable_parallel_pp.py | 2 +- tests/unit/moe/test_moe.py | 2 +- tests/unit/moe/test_moe_tp.py | 2 +- .../flops_profiler/test_flops_profiler.py | 2 +- .../runtime/compile/test_compile_wrapper.py | 2 +- .../unit/runtime/compile/test_compile_zero.py | 2 +- .../unit/runtime/compile/test_load_config.py | 2 +- .../half_precision/onebit/test_onebit.py | 2 +- .../unit/runtime/half_precision/test_fp16.py | 2 +- tests/unit/runtime/test_ds_initialize.py | 3 +- tests/unit/util.py | 15 ---------- 30 files changed, 56 insertions(+), 85 deletions(-) create mode 100644 deepspeed/utils/torch.py diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index d96c63d84e60..35c31f9fe2ec 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -9,6 +9,7 @@ from .backend import * from .comm import * from ..runtime import compiler +from deepspeed.utils.torch import required_torch_version import os DS_COMM_ALL_GATHER_OFF = False @@ -18,40 +19,19 @@ DS_COMM_REDUCE_OFF = False -def is_torch_ver_eq_2_0(): - TORCH_MAJOR, 
TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR == 2 and TORCH_MINOR == 0: - return True - return False - - -def is_torch_ver_ge_2_1(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - return True - return False - - -def torch_ver_ge_1_13(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 1 and TORCH_MINOR >= 13: - return True - return False - - def has_coalescing_manager(): has_c10d = hasattr(torch.distributed, 'distributed_c10d') return has_c10d and hasattr(torch.distributed.distributed_c10d, '_coalescing_manager') def has_all_reduce_coalesced(): - return hasattr(torch.distributed, "all_reduce_coalesced") and torch_ver_ge_1_13() + return hasattr(torch.distributed, "all_reduce_coalesced") and required_torch_version(min_version=1.13) def get_coalescing_manager(group, device, reqs, async_op): - if is_torch_ver_eq_2_0(): + if required_torch_version(min_version=2.0, max_version=2.0): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs) - elif is_torch_ver_ge_2_1(): + elif required_torch_version(min_version=2.1): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, async_ops=async_op) else: return torch.distributed.distributed_c10d._coalescing_manager(group, reqs) diff --git a/deepspeed/elasticity/utils.py b/deepspeed/elasticity/utils.py index 848ea8646449..78ae0352cf6e 100644 --- a/deepspeed/elasticity/utils.py +++ b/deepspeed/elasticity/utils.py @@ -3,7 +3,7 @@ # DeepSpeed Team -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version def is_torch_elastic_compatible(): diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 231f841ae8b7..a57b7519a295 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -9,7 +9,7 @@ import numpy as np from deepspeed.runtime.compression.cupy import CupyBackend -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 9ed250252e17..af8050c4646a 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -11,9 +11,10 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, required_torch_version, get_norm_with_moe_layers +from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger, log_dist +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import is_moe_param_group diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index ae3e5f573850..f8a50393ac5d 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator 
import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 9cd2e0f25648..0f70782fd3ff 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed import comm as dist -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index 9ef671e7e3b7..bd75ccd4f7a0 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index a7fd1910d7b2..530355f846e2 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -12,9 +12,10 @@ from torch._utils import _flatten_dense_tensors from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm, required_torch_version +from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT from deepspeed.accelerator import get_accelerator from deepspeed import comm as dist diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 31b9a14bf0f4..2b12a9573c4b 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -9,9 +9,7 @@ import torch from deepspeed import comm as dist -# To query whether we have send/recv support -from packaging.version import Version -from deepspeed.git_version_info import torch_info +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator _groups = None @@ -21,9 +19,7 @@ def can_send_recv() -> bool: - torch_version = Version(torch_info['version']) - sendrecv_min = Version('1.8') - return torch_version >= sendrecv_min + return required_torch_version(min_version=1.8) #initializes adjacent process groups diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 9d561f7271eb..c55f8a0e2995 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -14,7 +14,6 @@ import psutil import gc from math import sqrt -from packaging import version as pkg_version import torch from deepspeed import comm as dist @@ -1036,20 +1035,6 @@ def get_inactive_params(param_list): param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" - - torch_version = pkg_version.parse(torch.__version__) - - if min_version and 
pkg_version.parse(str(min_version)) > torch_version: - return False - - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False - - return True - - def get_norm_with_moe_layers(non_expert_norm, mpu, expert_tensors, norm_type=2): """ Compute the global norm with MoE experts diff --git a/deepspeed/utils/torch.py b/deepspeed/utils/torch.py new file mode 100644 index 000000000000..eb22d3561035 --- /dev/null +++ b/deepspeed/utils/torch.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from packaging import version as pkg_version + +import torch + + +def required_torch_version(min_version=None, max_version=None): + assert min_version or max_version, "Must provide a min_version or max_version argument" + + torch_version = pkg_version.parse(torch.__version__) + + if min_version and pkg_version.parse(str(min_version)) > torch_version: + return False + + if max_version and pkg_version.parse(str(max_version)) < torch_version: + return False + + return True diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index cf533063d6ec..25256d376eeb 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -11,7 +11,7 @@ import deepspeed import deepspeed.comm as dist import deepspeed.runtime.utils as ds_utils -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec diff --git a/tests/unit/checkpoint/test_mics_optimizer.py b/tests/unit/checkpoint/test_mics_optimizer.py index 3f853cd5c13a..9e56bf3446fa 100644 --- a/tests/unit/checkpoint/test_mics_optimizer.py +++ b/tests/unit/checkpoint/test_mics_optimizer.py @@ -8,7 +8,7 @@ import deepspeed -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * from unit.checkpoint.common import * diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index 36efe2a69002..89878b5d8fa9 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -4,7 +4,7 @@ # DeepSpeed Team from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * diff --git a/tests/unit/checkpoint/test_universal_checkpoint.py b/tests/unit/checkpoint/test_universal_checkpoint.py index 7adfe8410b55..e0c4f4745043 100644 --- a/tests/unit/checkpoint/test_universal_checkpoint.py +++ b/tests/unit/checkpoint/test_universal_checkpoint.py @@ -7,7 +7,7 @@ from types import SimpleNamespace from torch.utils._pytree import tree_map -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint import UNIVERSAL_CHECKPOINT_INFO from deepspeed.checkpoint.ds_to_universal import main as convert_to_universal diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 2312425c8aed..84b4eca6e2ca 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -8,7 +8,7 @@ from 
deepspeed.ops.op_builder import CPUAdamBuilder from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest, DistributedFixture from unit.simple_model import * diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index c6e5031349cb..1802c09f33b5 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -14,7 +14,7 @@ from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress from deepspeed.compression.helper import convert_conv1d_to_linear from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5), diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py index fd6a8e5ad2e1..77b51fcd5814 100644 --- a/tests/unit/inference/quantization/test_intX_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -11,7 +11,7 @@ from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization from deepspeed.inference.quantization.utils import Quantizer, DeQuantizer from deepspeed.inference.quantization.layers import QuantizedLinear -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from transformers.models.opt.modeling_opt import OPTDecoderLayer from transformers import AutoConfig, OPTConfig, AutoModel import pytest diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 824ecea5f144..cca1ef3584ad 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -13,7 +13,7 @@ from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index b500b9d857a5..e50fd18577b1 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -15,7 +15,7 @@ from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') diff --git 
a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index 0ec45933ff66..d39f9fe3d651 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -13,7 +13,7 @@ from deepspeed import get_accelerator from deepspeed.moe.sharded_moe import top1gating from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version @pytest.mark.parametrize("zero_stage", [0, 1, 2]) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index 0069c674690c..eb4668015c01 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -7,7 +7,7 @@ import deepspeed import pytest from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.moe.layer import MoE diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index bbcb01b489f4..c72deecf287f 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -9,7 +9,7 @@ from deepspeed.profiling.flops_profiler import get_model_profile from unit.simple_model import SimpleModel, random_dataloader from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator if torch.half not in get_accelerator().supported_dtypes(): diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py index 0bebeed117b4..d1830534f6ea 100644 --- a/tests/unit/runtime/compile/test_compile_wrapper.py +++ b/tests/unit/runtime/compile/test_compile_wrapper.py @@ -8,7 +8,7 @@ import deepspeed from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py index 79ab5efd5099..7568c27e3ed2 100644 --- a/tests/unit/runtime/compile/test_compile_zero.py +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -7,7 +7,7 @@ import torch from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from unit.runtime.compile.util import compare_loss diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py index f3c53ede91fd..601adae58884 100644 --- a/tests/unit/runtime/compile/test_load_config.py +++ b/tests/unit/runtime/compile/test_load_config.py @@ -9,7 +9,7 @@ from unit.simple_model import SimpleModel import deepspeed from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index 32ee262f3714..1be2d73ef4c0 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ 
b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -17,7 +17,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 9229794b39f8..5b300053d2a8 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -10,7 +10,7 @@ from deepspeed.ops.adam import FusedAdam from unit.common import DistributedTest from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import CPUAdamBuilder from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index 1f3cc991eba7..169096a6d4e5 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -17,7 +17,8 @@ from deepspeed.ops.adam import FusedAdam from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR from deepspeed.runtime.config import ADAM_OPTIMIZER -from deepspeed.runtime.utils import see_memory_usage, required_torch_version +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator diff --git a/tests/unit/util.py b/tests/unit/util.py index 5acf580ed3c8..feec326ede6c 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -7,7 +7,6 @@ import torch from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported from deepspeed.git_version_info import torch_info -from packaging import version as pkg_version def skip_on_arch(min_arch=7): @@ -62,20 +61,6 @@ def bf16_required_version_check(accelerator_check=True): return False -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" - - torch_version = pkg_version.parse(torch.__version__) - - if min_version and pkg_version.parse(str(min_version)) > torch_version: - return False - - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False - - return True - - def required_amp_check(): from importlib.util import find_spec if find_spec('apex') is None: From cc9e7b9c79ae48a7e93b9089e264eafb919178ab Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Thu, 11 Apr 2024 00:46:00 +0800 Subject: [PATCH 08/36] Update README.md for intel XPU support (#5389) 1. fix a type error 2. 
update the intel xpu HW support status --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f092a56c117..a1335caa4949 100755 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | | CPU | [![torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | | Intel Gaudi | [![hpu-gaudi2](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | -| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg)?branch=master](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | +| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation 
Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | @@ -163,7 +163,7 @@ dynamically link them at runtime. | ----------- | -------- | ---------------- | --------------------- | ------------------ | | Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | | Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | No | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. From a8b821535aa0b254efa681d51b4951734ca021cc Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:50:47 -0700 Subject: [PATCH 09/36] Optimize the fp-dequantizer to get high memory-BW utilization (#5373) This PR removes the for loop inside the dequantizer kernel and use as many threads and blocks as needed to dequantize the quantized matrix. The previous implementation was processing each group per thread block which can reduce the efficiency when have having smaller group-size and also processes more data per-thread which is unnecessary and we can use more parallelism to improve the dequantization performance. Based on my testing results, for a 4K by 4K matrix, dequantizing from fp8 to bf16 gives 2.5x speedup (improving the BW efficiency from 1 TB/s to 2.5 TB/s on Nvidia H100 GPU). --------- Co-authored-by: Reza Yazdani --- csrc/fp_quantizer/quantize.cu | 168 ++++++++++++++-------------------- 1 file changed, 68 insertions(+), 100 deletions(-) diff --git a/csrc/fp_quantizer/quantize.cu b/csrc/fp_quantizer/quantize.cu index 37be6cc0657c..5f0b58f124f0 100644 --- a/csrc/fp_quantizer/quantize.cu +++ b/csrc/fp_quantizer/quantize.cu @@ -219,119 +219,100 @@ __global__ void apply_quantization(T* val, } template -__global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size) +__global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size, int total_num_elements) { - int tidx = threadIdx.x; - int wid = tidx >> 5; - int lane = tidx & 0x1f; - int gid = blockIdx.x * quantization::warps + wid; + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.x * blockDim.x + threadIdx.x) * vector_size; + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); - - constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); - constexpr uint32_t load_stride = vector_size * hw_warp_size; - const uint32_t thread_offset = lane * vector_size; - const uint32_t thread_load_offset = lane * vector_size * quantized_bits / 8; - const uint32_t base_load_offset = - gid * (group_size * quantized_bits / 8 + 4) + thread_load_offset; // 4-byte scale offset - const uint32_t base_store_offset = gid * group_size + thread_offset; - const uint8_t* load_base_ptr = val + base_load_offset; + const uint32_t g_index = (tidx / group_size); + const uint32_t 
group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (tidx % group_size) * quantized_bits / 8; int mantisa_mask = ((1 << q_mantisa_bits) - 1); mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); - T* store_base_ptr = q_val + base_store_offset; - float scale; //= q_scale[gid]; + T* store_base_ptr = q_val + tidx; + float scale; uint8_t* scale_as_int8 = reinterpret_cast(&scale); if (quantized_bits == 6) { mem_access::load_global( - scale_as_int8, - val + gid * (group_size * quantized_bits / 8 + 4) + (group_size * quantized_bits / 8)); + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); mem_access::load_global( scale_as_int8 + quantization::quanitzed_access_granularity_6bits, - val + gid * (group_size * quantized_bits / 8 + 4) + (group_size * quantized_bits / 8) + + val + g_index * (group_size_bytes + 4) + group_size_bytes + quantization::quanitzed_access_granularity_6bits); } else mem_access::load_global( - scale_as_int8, - val + gid * (group_size * quantized_bits / 8 + 4) + (group_size * quantized_bits / 8)); - -#pragma unroll - for (int i = 0; i < unroll; i++) { - if (i * load_stride + thread_offset < group_size) { - uint64_t q_buf_in; - uint64_t q_buf_in1; - uint8_t* int8_data = reinterpret_cast(&q_buf_in); - uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); - uint32_t loading_offset = i * load_stride * quantized_bits / 8; - if (quantized_bits == 6) { - mem_access::load_global( - int8_data, load_base_ptr + loading_offset); - mem_access::load_global( - int8_data + quantization::quanitzed_access_granularity_6bits, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity_6bits); - mem_access::load_global( - int8_data + quantization::quanitzed_access_granularity_6bits * 2, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity_6bits * 2); - } else { + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { mem_access::load_global( - int8_data, load_base_ptr + loading_offset); - if (quantized_bits > 4) { + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { mem_access::load_global( - int8_data + quantization::quanitzed_access_granularity, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity); - if (quantized_bits == 12) { - mem_access::load_global( - int8_data1, - load_base_ptr + loading_offset + - quantization::quanitzed_access_granularity * 2); - } + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); } } - T store_buf[vector_size]; - uint16_t* q_buf = reinterpret_cast(store_buf); + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); #pragma unroll - for (int j = 0; j < 
vector_size; j++) { - uint16_t new_data; - if (j < 5 || quantized_bits != 12) { - new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); - } else { - if (j == 5) { - new_data = (uint16_t)(q_buf_in1); - new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); - } else - new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); - } + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = (uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } - uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); - uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; - uint16_t dst_mantisa = (new_data & _mantisa_mask); + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); - if (dst_exponent != (1 << q_exponent_bits) - 1) - dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + - (1 << (q_exponent_bits - 1)) - 1; + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; - q_buf[j] = ((sign << (q_exponent_bits + q_mantisa_bits)) | - (dst_exponent << q_mantisa_bits) | - (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); - float up_cast = conversion::to(store_buf[j]); - store_buf[j] = conversion::to(up_cast * scale); - } - mem_access::store_global( - store_base_ptr + i * load_stride, store_buf); + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); } + mem_access::store_global(store_base_ptr, store_buf); } } @@ -386,12 +367,6 @@ INSTANTIATE_LAUNCH_QUANTIZATION(__nv_bfloat16, 23, 8); #endif INSTANTIATE_LAUNCH_QUANTIZATION(__half, 23, 8); -#define LAUNCH_FOR_DEQUANTIZATION_UNROLL(COUNT) \ - case COUNT: \ - apply_dequantization \ - <<>>(val, q_val, group_size); \ - break; - template void launch_dequantization(uint8_t* val, T* q_val, @@ -401,21 +376,14 @@ void launch_dequantization(uint8_t* val, int q_exponent_bits, cudaStream_t stream) { - const dim3 grid((num_groups + quantization::warps - 1) / quantization::warps); + int blocks = ((num_groups * group_size) - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(blocks); const dim3 block(quantization::threads); - - constexpr int vals_per_unroll = hw_warp_size * quantization::access_granularity / sizeof(T); - const int copy_unroll = (group_size + vals_per_unroll - 1) / vals_per_unroll; - DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { - switch (copy_unroll) { - LAUNCH_FOR_DEQUANTIZATION_UNROLL(1) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(2) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(3) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(4) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(5) - LAUNCH_FOR_DEQUANTIZATION_UNROLL(6) - } + apply_dequantization + <<>>(val, q_val, group_size, (num_groups * group_size)); }); } #define INSTANTIATE_LAUNCH_DEQUANTIZATION(T, mantisa) \ From 2c51aba0b70a053252c6b0c95ecdecebb7b64d43 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka 
<81312776+tohtana@users.noreply.github.com> Date: Sat, 13 Apr 2024 13:40:31 -0700 Subject: [PATCH 10/36] Add custom reshaping for universal checkpoint (#5390) This PR adds more flexibility to define weight tensor reshaping for universal checkpointing. Currently universal checkpointing assumes a few patterns of partitioning for tensor parallelism, such as column/row wise partitioning of a 2-dim tensor. However, these are not flexible enough to define partitioning for more complex usages. Here are some examples: 1) MoE: The user may define the weight tensor for MoE's FFN as [n_experts * hidden_out, hidden_in]. For TP, we need to *view* this tensor as 3-dim tensor and partition it along `hidden_out` dimension. 2) GQA: The weights for QKV are often represented as one tensor and we may have Q, K and V with different sizes. The tensor shape will be [q_size + k_size + v_size, hidden]. We partition this along first dimension but for each Q, K, and V. In this case, we first need to partition Q, V, and V separately and then concatenate them to get a shard for TP. We propose a new pattern `PARAMETER_WITH_SUB_PARAMS` to support this. Here is the usage to cover the above use cases. You can define the view of the weight tensor and specify the dimension for partitioning based on the view. ```python from deepspeed.checkpoint import PARAMETER_WITH_SUB_PARAMS, SubparamShape info[PARAMETER_WITH_SUB_PARAMS] = [ asdict(SubparamShape(patterns=[layers_prefix + r"\d+moe.fc1.weight"], shape=(num_experts, hidden_out, hidden_in), partition_dim=1)), asdict(SubparamShape(patterns=[layers_prefix + r"\d+.qkv.weight"], shape=((q_size, k_size, v_size), hidden_size), partition_dim=0)), ... ] ``` The conversion script (`ds_to_universal.py`) merges TP-sharded weight tensors and the loader of universal checkpoints also partitions them following the information. Co-authored-by: Olatunji Ruwase --- deepspeed/checkpoint/__init__.py | 2 +- deepspeed/checkpoint/constants.py | 4 +++ deepspeed/checkpoint/ds_to_universal.py | 38 ++++++++++++++++++++ deepspeed/checkpoint/universal_checkpoint.py | 33 +++++++++++++++-- 4 files changed, 74 insertions(+), 3 deletions(-) diff --git a/deepspeed/checkpoint/__init__.py b/deepspeed/checkpoint/__init__.py index c9822693867d..1f645a494ade 100644 --- a/deepspeed/checkpoint/__init__.py +++ b/deepspeed/checkpoint/__init__.py @@ -15,6 +15,6 @@ from .zero_checkpoint import ZeROCheckpoint -from .universal_checkpoint import enable_universal_checkpoint +from .universal_checkpoint import enable_universal_checkpoint, SubparamShape from .constants import * diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index b3f199a67b98..046bc242002f 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -74,6 +74,8 @@ # Similarly, load_hp_checkpoint_state has to take the needed actions when loading from universal. 
PARAM_N_SUB_PARAMS = "param_n_sub_params" +SUB_PARAM_SHAPE = "sub_param_shape" + # Regex list of parameters that require special handling VOCABULARY_PARAMETER_PATTERNS = 'vocabulary_parameter_patterns' PIPELINE_REPLICATED_PARAMETER_PATTERNS = 'pipeline_replicated_parameter_patterns' @@ -81,3 +83,5 @@ PARAMETER_WITH_ROW_PARALLELISM_PATTERNS = 'parameter_with_row_parallelism_patterns' TP_REPLICATED_PARAMETER_PATTERNS = 'tp_replicated_parameter_patterns' PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0 = 'parameter_with_2_sub_params_cat_dim_0' +PARAMETER_WITH_SUB_PARAMS = 'parameter_with_sub_params' +SUB_PARAMS_SHAPE = 'sub_params_shape' diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index 9ec5d0b169e4..d5eca81c804f 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -6,6 +6,7 @@ # DeepSpeed Team from functools import partial +from itertools import chain import argparse import glob import itertools @@ -28,6 +29,7 @@ PARAM, CAT_DIM, PARAM_N_SUB_PARAMS, + SUB_PARAM_SHAPE, VOCAB_TENSOR, UNIVERSAL_CHECKPOINT_INFO, VOCABULARY_PARAMETER_PATTERNS, @@ -36,6 +38,8 @@ PARAMETER_TO_AVERAGE_PATTERNS, PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, + PARAMETER_WITH_SUB_PARAMS, + SubparamShape, ) @@ -180,8 +184,11 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): parameters_with_row_parallelism = universal_checkpoint_info.get(PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, []) vocabulary_parameters = universal_checkpoint_info.get(VOCABULARY_PARAMETER_PATTERNS, []) parameters_with_2_sub_params_cat_dim_0 = universal_checkpoint_info.get(PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, []) + parameter_with_sub_params = universal_checkpoint_info.get(PARAMETER_WITH_SUB_PARAMS, []) + unmatched_patterns = set(replicated_parameters + parameters_to_average + parameters_with_row_parallelism + vocabulary_parameters + parameters_with_2_sub_params_cat_dim_0) + unmatched_patterns.update(chain.from_iterable(SubparamShape(**s).patterns for s in parameter_with_sub_params)) def get_matched_pattern(patterns_, name_): matched_ = [pattern_ for pattern_ in patterns_ if re.match(pattern_, name_)] @@ -192,6 +199,17 @@ def get_matched_pattern(patterns_, name_): return pattern_ return None + def get_matched_sub_params_pattern(name_): + for subparam_shape_dict in parameter_with_sub_params: + subparam_shape = SubparamShape(**subparam_shape_dict) + for pattern_ in subparam_shape.patterns: + if re.match(pattern_, name_): + unmatched_patterns.discard(pattern_) + return subparam_shape + return None + + matched_sub_params_shape = get_matched_sub_params_pattern(name) + step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) if step_merged: _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) @@ -219,6 +237,26 @@ def get_matched_pattern(patterns_, name_): param = torch.cat([merged_chunks_0, merged_chunks_1], dim=cat_dim) ckpt_dict[CAT_DIM] = cat_dim ckpt_dict[PARAM_N_SUB_PARAMS] = 2 + elif matched_sub_params_shape: + merged_chunks = [] + partition_dim = matched_sub_params_shape.partition_dim + + sub_dim_sizes = matched_sub_params_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes, ) + + partition_shape = [sum(d) if isinstance(d, tuple) else d for d in matched_sub_params_shape.shape] + partition_shape = [d // tp_degree if i == partition_dim else d for i, d in enumerate(partition_shape)] + slices = 
[s.view(partition_shape) for s in slices] + + offset = 0 + for sub_dim_size in sub_dim_sizes: + part_sub_dim_size = sub_dim_size // tp_degree + merged_chunks.append( + torch.cat([s.narrow(partition_dim, offset, part_sub_dim_size) for s in slices], dim=partition_dim)) + offset += part_sub_dim_size + param = torch.cat(merged_chunks, dim=partition_dim) + ckpt_dict[SUB_PARAM_SHAPE] = matched_sub_params_shape else: cat_dim = 1 if get_matched_pattern(parameters_with_row_parallelism, name) else 0 # print(f"merge {name} with CAT DIM: {cat_dim}") diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index 86c8dc904b8c..064891a8bb54 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -7,7 +7,16 @@ import re import torch import types -from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS) +from typing import List, Tuple, Union +from dataclasses import dataclass +from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS, SUB_PARAM_SHAPE) + + +@dataclass +class SubparamShape: + patterns: List[str] + shape: Tuple[Union[Tuple[int], int]] + partition_dim: int def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): @@ -76,12 +85,32 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") + sub_param_shape = ckpt_dict.get(SUB_PARAM_SHAPE, None) # since when we do many to 1 on tp we cat sometimes on dim=0 and other times on dim=1 we have to do exactly the same in reverse # special case is when a single parameter is effectively a container for multiple sub parameters # (more details at PARAM_N_SUB_PARAMS definition) chunk_dim = ckpt_dict.get(CAT_DIM, 0) n_sub_params = ckpt_dict.get(PARAM_N_SUB_PARAMS, 1) - if n_sub_params > 1: + if sub_param_shape: + partition_dim = sub_param_shape.partition_dim + sub_dim_sizes = sub_param_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes, ) + + partition_shape = [sum(d) if isinstance(d, tuple) else d for d in sub_param_shape.shape] + full_hp_param = full_hp_param.view(partition_shape) + + offset = 0 + merged_chunks = [] + for sub_dim_size in sub_dim_sizes: + sub_params_tp_slice = full_hp_param.narrow(partition_dim, + offset, sub_dim_size).chunk(tp_world_size, + dim=partition_dim)[tp_rank] + merged_chunks.append(sub_params_tp_slice) + offset += sub_dim_size + tp_hp_slice = torch.cat(merged_chunks, dim=partition_dim) + + elif n_sub_params > 1: sub_params = full_hp_param.chunk(n_sub_params, dim=chunk_dim) sub_params_tp_slice = [p.chunk(tp_world_size, dim=chunk_dim)[tp_rank] for p in sub_params] tp_hp_slice = torch.cat(sub_params_tp_slice, dim=chunk_dim) From f69f8840fc62e6cbbbad9be4216729158611127e Mon Sep 17 00:00:00 2001 From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com> Date: Sun, 14 Apr 2024 02:11:14 +0530 Subject: [PATCH 11/36] Removal of cuda hardcoded string with get_device function (#5351) In UTs removed 'cuda' string hardcode by replacing with device variable set to get_accelerator().device_name() Co-authored-by: Shaik Raza Sikander Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- tests/unit/hybrid_engine/test_he_lora.py | 7 +++++-- tests/unit/inference/test_stable_diffusion.py | 6 +++--- .../transformer/inference/test_attention.py | 20 
+++++++++---------- .../ops/transformer/inference/test_gelu.py | 6 ++++-- .../transformer/inference/test_layer_norm.py | 13 ++++++------ .../ops/transformer/inference/test_softmax.py | 3 ++- 6 files changed, 31 insertions(+), 24 deletions(-) diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py index ea27239ed55e..5f53a237c340 100644 --- a/tests/unit/hybrid_engine/test_he_lora.py +++ b/tests/unit/hybrid_engine/test_he_lora.py @@ -15,6 +15,7 @@ import numpy.testing as npt from unit.common import DistributedTest from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -125,7 +126,8 @@ def get_model(self, model_name): model_config.dropout = 0.0 model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config) model = model.half() - model = model.to(f'cuda:{local_rank}') + device = get_accelerator().device_name() + model = model.to(f'{device}:{local_rank}') return model def get_tokenizer(self, model_name): @@ -190,7 +192,8 @@ def test_lora(self, batch_size, model_name, zero_stage, offload_device): model.train() batch = tokenizer(train_sentences, max_length=16, padding="max_length", truncation=True, return_tensors="pt") - batch = to_device(batch, f'cuda:{local_rank}') + device = get_accelerator().device_name() + batch = to_device(batch, f'{device}:{local_rank}') batch["labels"] = batch["input_ids"] outputs = model(**batch, use_cache=False) loss = outputs.loss diff --git a/tests/unit/inference/test_stable_diffusion.py b/tests/unit/inference/test_stable_diffusion.py index ac39b7ab12fa..775a02c2e878 100644 --- a/tests/unit/inference/test_stable_diffusion.py +++ b/tests/unit/inference/test_stable_diffusion.py @@ -20,14 +20,14 @@ class TestStableDiffusion(DistributedTest): def test(self): from diffusers import DiffusionPipeline from image_similarity_measures.quality_metrics import rmse - generator = torch.Generator(device=get_accelerator().current_device()) + dev = get_accelerator().device_name() + generator = torch.Generator(device=dev) seed = 0xABEDABE7 generator.manual_seed(seed) prompt = "a dog on a rocket" model = "prompthero/midjourney-v4-diffusion" local_rank = int(os.getenv("LOCAL_RANK", "0")) - device = torch.device(f"cuda:{local_rank}") - + device = torch.device(f"{dev}:{local_rank}") pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] diff --git a/tests/unit/ops/transformer/inference/test_attention.py b/tests/unit/ops/transformer/inference/test_attention.py index 13abe8b915c7..0521245bcdf3 100644 --- a/tests/unit/ops/transformer/inference/test_attention.py +++ b/tests/unit/ops/transformer/inference/test_attention.py @@ -31,23 +31,23 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float pytest.skip("triton has to be installed for the test") minus_inf = -65504.0 - + dev = deepspeed.accelerator.get_accelerator().device_name() # skip autotune in testing from deepspeed.ops.transformer.inference.triton.matmul_ext import fp16_matmul fp16_matmul.skip_autotune() from deepspeed.ops.transformer.inference.triton.attention import _triton_attention, _triton_packed_flash torch.manual_seed(20) - q = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - k = 
torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) + q = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) + k = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) + v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) sm_scale = 0.3 # reference implementation p = torch.matmul(q, k.transpose(2, 3)) * sm_scale score = p - mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device=dev) + M = torch.tril(torch.ones((N_CTX, N_CTX), device=dev)) if causal: for z in range(BATCH): for h in range(H): @@ -58,7 +58,7 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float context = ref_out # adjust it to expected tensor format and run test - qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device='cuda', requires_grad=False) + qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device=dev, requires_grad=False) qkv[:, :, :H * D_HEAD] = q.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) qkv[:, :, 1 * H * D_HEAD:2 * H * D_HEAD] = k.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) qkv[:, :, 2 * H * D_HEAD:] = v.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) @@ -66,12 +66,12 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float if use_flash: if not get_accelerator().is_triton_supported(): pytest.skip("triton flash attention is supported when the compute capability > 8.0") - triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device="cuda") + triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device=dev) if not causal: - lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device='cuda') + lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device=dev) for i, l in enumerate(lengths): triton_mask[i, ..., l:] = minus_inf - mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device=dev) for b in range(BATCH): mask[b, :, :, lengths[b]:] = minus_inf ref_out = ref_torch_attention(q, k, v, mask, sm_scale) diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py index de924848bfb4..675860b00bdb 100644 --- a/tests/unit/ops/transformer/inference/test_gelu.py +++ b/tests/unit/ops/transformer/inference/test_gelu.py @@ -42,8 +42,9 @@ def run_gelu_ds(activations, use_triton_ops=False): from deepspeed.ops.transformer.inference.triton import gelu return gelu(activations) + device = deepspeed.accelerator.get_accelerator().device_name() channels = activations.shape[-1] - bias = torch.zeros((channels), dtype=activations.dtype, device='cuda') + bias = torch.zeros((channels), dtype=activations.dtype, device=device) global inference_module if inference_module is None: inference_module = InferenceBuilder().load() @@ -60,7 +61,8 @@ def run_gelu_ds(activations, use_triton_ops=False): @pytest.mark.parametrize("dtype", [torch.float16]) @pytest.mark.parametrize("use_triton_ops", [True, False]) def test_gelu(batch, sequence, channels, dtype, use_triton_ops): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + 
device = deepspeed.accelerator.get_accelerator().device_name() + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) activations_ref = activations_ds.clone().detach() if not deepspeed.HAS_TRITON and use_triton_ops: diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 711a35213015..9eac612aa29c 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -175,19 +175,20 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): def test_triton_layer_norm(M, N, dtype, residual, input_bias, eps=1e-5, device='cuda'): if not deepspeed.HAS_TRITON: pytest.skip("triton has to be installed for the test") + dev = get_accelerator().device_name() torch.manual_seed(0) # create data x_shape = (M, N) w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - x_bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') + weight = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + bias = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + x_bias = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device=dev) dy = .1 * torch.randn_like(x) if residual: - res = torch.rand(x_shape, dtype=dtype, device='cuda', requires_grad=False) + res = torch.rand(x_shape, dtype=dtype, device=dev, requires_grad=False) else: - res = torch.zeros(x_shape, dtype=dtype, device='cuda', requires_grad=False) + res = torch.zeros(x_shape, dtype=dtype, device=dev, requires_grad=False) x.requires_grad_(True) # forward pass if residual or input_bias: diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py index 76046f31e01a..7d0d6e14b651 100644 --- a/tests/unit/ops/transformer/inference/test_softmax.py +++ b/tests/unit/ops/transformer/inference/test_softmax.py @@ -43,7 +43,8 @@ def run_softmax_ds(input, use_triton_ops=False): def test_softmax(batch, sequence, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") - input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + device = deepspeed.accelerator.get_accelerator().device_name() + input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) input_ref = input_ds.clone().detach() ds_out = run_softmax_ds(input_ds, use_triton_ops) From 7b5b06602d5941cf7ea6170062d3f81c9002d788 Mon Sep 17 00:00:00 2001 From: Guanhua Wang Date: Sun, 14 Apr 2024 11:26:19 -0700 Subject: [PATCH 12/36] fix pagable h2d memcpy (#5301) ZeRO offload case Fix the issue of pageble h2d memcpy in step process. Now h2d memcpy uses pinned memory. Speedup h2d memcpy by 6x on single GPU and 4-5x on 8GPU node. 
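For context, here is a minimal sketch of the pinned-memory idea (illustrative tensor names and sizes, not the actual optimizer code; `get_accelerator().pin_memory` and `current_device_name` are the accelerator APIs used in the diff below):

```python
# Hedged sketch: keep the fp32 master partition in page-locked (pinned) host
# memory so the step-time host-to-device copy can use DMA instead of a slow
# pageable-memory transfer. Sizes and names are made up for illustration.
import torch
from deepspeed.accelerator import get_accelerator

fp32_partition = torch.randn(1 << 20, dtype=torch.float32)      # CPU master weights
fp32_partition = get_accelerator().pin_memory(fp32_partition)   # pinned host buffer

device = get_accelerator().current_device_name()
bit16_partition = torch.empty(fp32_partition.numel(), dtype=torch.half, device=device)

# The H2D copy now reads from pinned memory, which is what gives the reported speedup.
bit16_partition.data.copy_(fp32_partition.to(device).data)
```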
cc @tjruwase --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Ubuntu Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/zero/stage_1_and_2.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 16b9c3c18919..225c085f6f2b 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -390,11 +390,16 @@ def __init__(self, # Note that the params in single_partition_of_fp32_groups is cloned and detached # from the origin params of the model. if not fp16_master_weights_and_gradients: - self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().float().detach()) + weights_partition = self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().float().detach() else: - self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().half().detach()) + weights_partition = self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().half().detach() + + if self.cpu_offload: + weights_partition = get_accelerator().pin_memory(weights_partition) + + self.single_partition_of_fp32_groups.append(weights_partition) # Set local optimizer to have flat params of its own partition. # After this, the local optimizer will only contain its own partition of params. @@ -1862,7 +1867,8 @@ def step(self, closure=None): # bit16_partitions[partition_id].data.copy_(fp32_partition.data) bit16_partitions = self.parallel_partitioned_bit16_groups[i] fp32_partition = self.single_partition_of_fp32_groups[i] - bit16_partitions[partition_id].data.copy_(fp32_partition.data) + bit16_partitions[partition_id].data.copy_( + fp32_partition.to(get_accelerator().current_device_name()).data) self.timers(OPTIMIZER_STEP_TIMER).stop() else: From 54c06872647ca60699f752e60ac1643bd05aa63c Mon Sep 17 00:00:00 2001 From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com> Date: Sun, 14 Apr 2024 22:40:52 +0300 Subject: [PATCH 13/36] stage3: efficient compute of scaled_global_grad_norm (#5256) using torch.norm instead of inefficient for loop --------- Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/stage3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index c6ff216edfcb..68cab13c4a93 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -15,7 +15,7 @@ from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce -from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, get_only_unique_item +from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum @@ -2027,7 +2027,7 @@ def step(self, closure=None): return norm_groups = self._get_norm_groups() - scaled_global_grad_norm = get_global_norm(norm_list=norm_groups) + scaled_global_grad_norm = torch.norm(torch.stack(norm_groups)) # Stash unscaled gradient norm self._global_grad_norm = 
scaled_global_grad_norm / self.loss_scale From e3d873a00ec2a78f74796f9a1a0d571f310e5f87 Mon Sep 17 00:00:00 2001 From: "ZHENG, Zhen" Date: Tue, 16 Apr 2024 01:23:28 +0800 Subject: [PATCH 14/36] Fix the FP6 kernels compilation problem on non-Ampere GPUs. (#5333) Refine the guards of FP6 kernel compilation. Fix the `undefined symbol` problem of FP6 kernels on non-Ampere architectures. Related issue: https://github.com/microsoft/DeepSpeed-MII/issues/443. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Michael Wyatt --- .../v2/kernels/core_ops/core_ops.cpp | 4 +-- .../cuda_linear/include/kernel_matmul.cuh | 11 ++++++++ .../cuda_linear/include/kernel_reduction.cuh | 5 ++++ .../cuda_linear/include/ptx_cp.async.cuh | 25 ++++++++++++++++--- .../core_ops/cuda_linear/include/ptx_mma.cuh | 16 ++++++++++-- .../cuda_linear/include/utils_core.cuh | 4 +-- .../cuda_linear/include/utils_gmem.cuh | 4 +-- .../include/utils_paralleldequant.cuh | 4 +-- .../cuda_linear/include/weight_prepacking.h | 5 ++++ ..._linear_kernels.cpp => linear_kernels.cpp} | 2 +- ...cuda_linear_kernels.h => linear_kernels.h} | 7 ++++-- .../{fp6_linear.cu => linear_kernels_cuda.cu} | 11 +++++--- .../{fp6_linear.cuh => linear_kernels_cuda.h} | 5 ++++ deepspeed/inference/v2/modules/heuristics.py | 11 +++++++- op_builder/inference_core_ops.py | 13 ++-------- .../modules/test_quantized_linear_module.py | 3 +-- 16 files changed, 96 insertions(+), 34 deletions(-) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{cuda_linear_kernels.cpp => linear_kernels.cpp} (99%) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{cuda_linear_kernels.h => linear_kernels.h} (85%) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{fp6_linear.cu => linear_kernels_cuda.cu} (98%) rename deepspeed/inference/v2/kernels/core_ops/cuda_linear/{fp6_linear.cuh => linear_kernels_cuda.h} (94%) diff --git a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp index 2397b0694696..3f36a6bf01cb 100644 --- a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp +++ b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp @@ -8,9 +8,9 @@ #include "bias_activation.h" #include "blas.h" -#include "cuda_linear_kernels.h" #include "gated_activation_kernels.h" #include "layer_norm.h" +#include "linear_kernels.h" #include "rms_norm.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) @@ -35,7 +35,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("rms_norm", &rms_norm, "DeepSpeed rms norm in CUDA"); m.def("rms_pre_norm", &rms_pre_norm, "DeepSpeed rms pre norm in CUDA"); - // cuda_linear_kernels.h + // linear_kernels.h m.def("cuda_wf6af16_linear", &cuda_wf6af16_linear, "DeepSpeed Wf6Af16 linear in CUDA"); m.def( "preprocess_weight", &preprocess_weight, "preprocess the FP16 weight to be 2bit and 4 bit"); diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh index aa6ea6c4b1c2..0262baef4614 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_KERNEL_MATMUL_CUH +#define DEEPSPEED_CUDA_LINEAR_KERNEL_MATMUL_CUH + #include "configs.h" #include "utils_core.cuh" #include "utils_gmem.cuh" @@ -26,6 +29,8 @@ __global__ 
void QUANT_GEMM_Kernel(const uint4* Weight1, const size_t K_Global, int Split_K) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 900 + #ifdef DEBUG_MODE assert(K_Global % TilingConfig::TILE_K == 0); assert(M_Global % TilingConfig::TILE_M == 0); @@ -258,4 +263,10 @@ __global__ void QUANT_GEMM_Kernel(const uint4* Weight1, else BlockGlobalPTR[j + i * M_Global] = smem_CFrag[i][j]; } + +#else +#warning "The FP6 functions are only available on Ampere GPUs." +#endif } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh index 8c49f8b0b3a5..c417e6a46a7c 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_KERNEL_REDUCTION_CUH +#define DEEPSPEED_CUDA_LINEAR_KERNEL_REDUCTION_CUH + #include #include #include @@ -36,3 +39,5 @@ __global__ void SplitK_Reduction(half* C, #pragma unroll for (int i = 0; i < HALF_PER_128BIT; i++) THREAD_GPTR_C[i] = __float2half_rn(Results[i]); } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh index 7f36cfd5d961..39874e023539 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef PTX_CP_ASYNC_CUH -#define PTX_CP_ASYNC_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_PTX_CP_ASYNC_CUH +#define DEEPSPEED_CUDA_LINEAR_PTX_CP_ASYNC_CUH #include #include @@ -17,6 +17,7 @@ __device__ __forceinline__ void cp_async(half* smem_ptr, const half* global_ptr, bool pred_guard = true) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 static_assert(SizeInBytes == 16, "Size is not supported"); unsigned smem_int_ptr = __cvta_generic_to_shared(smem_ptr); asm volatile( @@ -28,25 +29,43 @@ __device__ __forceinline__ void cp_async(half* smem_ptr, "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes)); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif } /// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. __device__ __forceinline__ void cp_async_group_commit() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 asm volatile("cp.async.commit_group;\n" ::); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif } /// Blocks until all but previous cp.async.commit_group operations have committed. template __device__ __forceinline__ void cp_async_wait_group() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif } /// Blocks until all previous cp.async.commit_group operations have committed. 
// cp.async.wait_all is equivalent to : // cp.async.commit_group; // cp.async.wait_group 0; -__device__ __forceinline__ void cp_async_wait_all() { asm volatile("cp.async.wait_all;\n" ::); } +__device__ __forceinline__ void cp_async_wait_all() +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_all;\n" ::); +#else +#warning "The async copy functions are only supported on Ampere and newer architectures" +#endif +} #endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh index f13abe036279..8023629caac9 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef PTX_MMA_CUH -#define PTX_MMA_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_PTX_MMA_CUH +#define DEEPSPEED_CUDA_LINEAR_PTX_MMA_CUH #include #include @@ -22,6 +22,7 @@ __device__ __forceinline__ void B_FromSharedToReg( half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], int slice_id) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #ifdef DEBUG_MODE static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); @@ -54,6 +55,9 @@ __device__ __forceinline__ void B_FromSharedToReg( smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); } } +#else +#warning "The matrix load functions are only supported on Ampere and newer architectures" +#endif } #else // Debug: Whether ldmatrix.trans is required??? @@ -64,6 +68,7 @@ __device__ __forceinline__ void B_FromSharedToReg( half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], int k_offset) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #ifdef DEBUG_MODE static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); @@ -96,6 +101,9 @@ __device__ __forceinline__ void B_FromSharedToReg( smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); } } +#else +#warning "The matrix load functions are only supported on Ampere and newer architectures" +#endif } #endif @@ -103,6 +111,7 @@ __device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t __restrict__ c[], uint32_t __restrict__* a, uint32_t __restrict__* b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{ %0, %1, %2, %3}," @@ -120,6 +129,9 @@ __device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t __restrict__ c[], "r"(c[1]), "r"(c[2]), "r"(c[3])); +#else +#warning "The mma functions are only implemented for Ampere and newer architectures" +#endif } #endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh index 713cebc57e33..a65575a1ba5a 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef UTILS_CORE_CUH -#define UTILS_CORE_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_CORE_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_CORE_CUH #include diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh 
b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh index 62b77edaa37a..d0c58352cd56 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef UTILS_GMEM_CUH -#define UTILS_GMEM_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_GMEM_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_GMEM_CUH #include #include "configs.h" diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh index ff13868c1347..9f035f12cfcd 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh @@ -5,8 +5,8 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 -#ifndef UTILS_PARALLELDEQUANT_CUH -#define UTILS_PARALLELDEQUANT_CUH +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_PARALLELDEQUANT_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_PARALLELDEQUANT_CUH #include #include diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h index c8cc7243f341..98805e6c111c 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_WEIGHT_PREPACKING_H +#define DEEPSPEED_CUDA_LINEAR_WEIGHT_PREPACKING_H + #include #include #include @@ -202,3 +205,5 @@ void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K) for (size_t i = 0; i < BytesPerThread_4bit * 32 / 4; i++) BitInterleaving_4bit(Weight_4bit + 4 * i); } + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp similarity index 99% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp index 677bec22ded8..3b4966eb822b 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp @@ -5,7 +5,7 @@ #include -#include "cuda_linear_kernels.h" +#include "linear_kernels.h" namespace { diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h similarity index 85% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h index 0f5882d519ca..01a6b7c18af8 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h @@ -3,13 +3,14 @@ // DeepSpeed Team -#pragma once +#ifndef DEEPSPEED_CUDA_LINEAR_KERNELS_H +#define DEEPSPEED_CUDA_LINEAR_KERNELS_H #include #include #include "ds_kernel_utils.h" -#include "fp6_linear.cuh" +#include "linear_kernels_cuda.h" void cuda_wf6af16_linear(torch::Tensor& output, torch::Tensor& 
hidden_states, @@ -23,3 +24,5 @@ void cuda_wf6af16_linear(torch::Tensor& output, int split_k); std::vector preprocess_weight(torch::Tensor& Weight); + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu similarity index 98% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu index 64e06a5435c6..ea0203c42f84 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu @@ -19,6 +19,8 @@ #include #include +#include "linear_kernels_cuda.h" + template static void Kernel_Ex(cudaStream_t stream, const uint4* Weight1, @@ -50,7 +52,7 @@ static void Kernel_Ex(cudaStream_t stream, size_t dimM = M_Global * Split_K / TilingConfig::TILE_M; dim3 GridDim(dimN, dimM, 1); dim3 BlockDim(WARP_SIZE * TilingConfig::BLOCK_WARPS, 1, 1); -// + #ifdef DEBUG_MODE printf( "GridDim.x: %d, GridDim.y: %d, GridDim.z: %d, BlockDim.x: %d, BlockDim.y: %d, BlockDim.z: " @@ -64,6 +66,7 @@ static void Kernel_Ex(cudaStream_t stream, SHMEM_SZ); printf("\n"); #endif + QUANT_GEMM_Kernel<<>>( Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); } @@ -121,7 +124,7 @@ cudaError_t fp6_linear_kernel(cudaStream_t stream, break; default: if (N_PowerOf2 % 128 != 0) { - printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2); + printf("QuantLLM_API Error: Unsupported N dimension %lu!\n", N_PowerOf2); return cudaErrorUnknown; } Kernel_Ex, half>( @@ -192,7 +195,7 @@ cudaError_t fp6_linear_kernel(cudaStream_t stream, break; default: if (N_PowerOf2 % 128 != 0) { - printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2); + printf("QuantLLM_API Error: Unsupported N dimension %lu!\n", N_PowerOf2); return cudaErrorUnknown; } Kernel_Ex, float>(stream, @@ -236,7 +239,7 @@ calling our CUDA kernel. torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats, torch::Tensor _weights, torch::Tensor _scales, - int splitK = 1) + int splitK) { int num_in_feats = _in_feats.size(0); int num_in_channels = _in_feats.size(1); diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h similarity index 94% rename from deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h index 95f7f6050c15..6a83290f0cb5 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h @@ -5,6 +5,9 @@ // This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 +#ifndef DEEPSPEED_CUDA_LINEAR_FP6_LINEAR_CUH +#define DEEPSPEED_CUDA_LINEAR_FP6_LINEAR_CUH + #include #include #include @@ -44,3 +47,5 @@ void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K); * Weight prepacking (Pytorch interface). 
*/ torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t M, size_t K); + +#endif diff --git a/deepspeed/inference/v2/modules/heuristics.py b/deepspeed/inference/v2/modules/heuristics.py index d176206f3c60..f719e299a4b2 100644 --- a/deepspeed/inference/v2/modules/heuristics.py +++ b/deepspeed/inference/v2/modules/heuristics.py @@ -90,8 +90,17 @@ def instantiate_linear(linear_config: DSLinearConfig, engine_config: RaggedInfer if quantization_mode is None: config = ConfigBundle(name="blas_fp_linear", config=linear_config) else: - # Currently, we only support ``quantized_wf6af16_linear``. + # Currently, we only support ``quantized_wf6af16_linear`` on NVIDIA Ampere GPUs. if quantization_mode == "wf6af16": + import torch + if not torch.cuda.is_available(): #ignore-cuda + raise ValueError("WF6AF16 quantization is only supported on CUDA") + else: + is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None + if is_rocm_pytorch: + raise ValueError("WF6AF16 quantization is only supported on NVIDIA GPUs") + elif torch.cuda.get_device_properties(0).major != 8: #ignore-cuda + raise ValueError("WF6AF16 quantization is only supported on Ampere architectures") config = ConfigBundle(name="quantized_wf6af16_linear", config=linear_config) else: raise ValueError(f"Unsupported quantization mode: {quantization_mode}") diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py index 3c53774d0a50..d1957f39d9a8 100755 --- a/op_builder/inference_core_ops.py +++ b/op_builder/inference_core_ops.py @@ -57,8 +57,6 @@ def get_prefix(self): return "deepspeed" if os.path.isdir(ds_path) else ".." def sources(self): - import torch - sources = [ "inference/v2/kernels/core_ops/core_ops.cpp", "inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp", @@ -69,17 +67,10 @@ def sources(self): "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu", "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cpp", "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu", + "inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp", + "inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu", ] - # The source files with specific GPU architecture requirements. 
- if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda - cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda - if cuda_capability != 8: - self.warning("FP6 quantization kernel is only supported on Ampere architectures") - else: - sources.append("inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu") - sources.append("inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp") - prefix = self.get_prefix() sources = [os.path.join(prefix, src) for src in sources] return sources diff --git a/tests/unit/inference/v2/modules/test_quantized_linear_module.py b/tests/unit/inference/v2/modules/test_quantized_linear_module.py index a7bd965072ac..050f21c3bf3a 100644 --- a/tests/unit/inference/v2/modules/test_quantized_linear_module.py +++ b/tests/unit/inference/v2/modules/test_quantized_linear_module.py @@ -145,10 +145,9 @@ def _fp6_quantized_linear_helper(tokens: int, ActivationType.ReGLU, ActivationType.SiGLU, ] -all_tokens = [1, 37] +all_tokens = [37] all_in_out_channels = [ (4096, 4096), - (8192, 28672), ] From 2b3d31f9804f024f0cbb036771c35a5bdd9ef848 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 15 Apr 2024 12:53:41 -0700 Subject: [PATCH 15/36] Update version.txt after 0.14.1 release (#5413) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.14.1 Author - @loadams Co-authored-by: loadams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 930e3000bdc9..e867cc2a66a8 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.1 +0.14.2 From 8949105369ccaacecc7b707601b787e4af730e89 Mon Sep 17 00:00:00 2001 From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com> Date: Tue, 16 Apr 2024 01:28:56 +0530 Subject: [PATCH 16/36] Remove dtype(fp16) condition check for residual_add unit test (#5329) When the dtype is bf16 or fp32 the if condition is not satisfied and it continues execution instead of skipping when triton is not installed. 
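For intuition, a tiny, hedged sketch of the boolean logic (stand-in variables, not the test itself) shows why bf16/fp32 cases slipped past the old guard; the actual one-line fix is in the diff below:

```python
# Illustrative only: the old skip condition included `dtype == torch.float16`,
# so on a machine without triton the bf16/fp32 parametrizations evaluated the
# guard to False and kept running into the triton code path instead of skipping.
import torch

HAS_TRITON = False      # stand-in for deepspeed.HAS_TRITON on a box without triton
use_triton_ops = True

for dtype in (torch.float16, torch.bfloat16, torch.float32):
    old_skip = (not HAS_TRITON) and use_triton_ops and dtype == torch.float16
    new_skip = (not HAS_TRITON) and use_triton_ops
    print(dtype, "old guard skips:", old_skip, "new guard skips:", new_skip)
```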
Co-authored-by: Shaik Raza Sikander Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- tests/unit/ops/transformer/inference/test_residual_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index c2952f74ff2d..91830e25fc81 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -77,7 +77,7 @@ def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, f @pytest.mark.parametrize("use_triton_ops", [True, False]) def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops and dtype == torch.float16: + if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) From 258e500f3fcd33fa42871ff87612d135399017ac Mon Sep 17 00:00:00 2001 From: YiSheng5 Date: Wed, 17 Apr 2024 01:40:45 +0800 Subject: [PATCH 17/36] [XPU] Use non_daemonic_proc by default on XPU device (#5412) Set non_daemonic_proc=True by default on XPU Device, using non_daemonic_proc for unit test. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- tests/unit/common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/common.py b/tests/unit/common.py index 1fd83de81f02..a2593e703aef 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -248,6 +248,10 @@ def _launch_procs(self, num_procs): f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" ) + if get_accelerator().device_name() == 'xpu': + self.non_daemonic_procs = True + self.reuse_dist_env = False + # Set start method to `forkserver` (or `fork`) mp.set_start_method('forkserver', force=True) From 0896503e2f4d3b12583dfe267e52db3a1d63b88d Mon Sep 17 00:00:00 2001 From: inkcherry Date: Wed, 17 Apr 2024 02:21:26 +0800 Subject: [PATCH 18/36] Fix a convergence issues in TP topology caused by incorrect grad_norm. (#5411) Some users are concerned that changes in TP topology during MOE training may potentially cause interference with experiments when noticing similar issues https://github.com/microsoft/Megatron-DeepSpeed/issues/151 https://github.com/microsoft/Megatron-DeepSpeed/pull/176/files We found a grad_norm calculation error after enabling TP. This error occurs because flattened grad of a params group is used, where the group contains both non-TP and TP parameters. Therefore, it is not possible to use a single attribute to determine whether flattened grad needs to compute the norm. In the current code logic, all params are assumed to be non-TP, resulting in only tp_rank0 grad participating in grad_norm computation. Other tp_rank grads have grad_norm_sum equal to 0. We tested and found that with TP=1 and TP=4, the difference in grad_norm is approximately twice (sqrt(4)). This aligns with the aforementioned issue. This problem should also affect dense models. Due to the absence of flattening params_group grad in bf16, this problem is avoided. We tested the loss curve on the 1.3B model. 
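For intuition before the loss-curve comparison, here is a minimal, hedged sketch of the masking idea (made-up sizes and ranges; the real optimizer builds the mask without a Python loop and reduces the result across ranks):

```python
# Hedged sketch: a flattened fp32 group that concatenates a TP-partitioned weight
# (every TP rank owns a distinct shard, so every rank counts it) and a replicated
# bias (identical on all TP ranks, so only TP rank 0 should count it). The
# [start, end) ranges mark the replicated span to exclude on ranks > 0.
import torch

tp_rank = 1
tp_weight_shard = torch.randn(8)   # this rank's shard of a TP-partitioned weight
replicated_bias = torch.randn(4)   # replicated across TP ranks
flat_grad = torch.cat([tp_weight_shard, replicated_bias])

mask_ranges = [[8, 12]] if tp_rank > 0 else []   # spans to drop from this rank's norm

mask = torch.zeros_like(flat_grad, dtype=torch.bool)
for start, end in mask_ranges:
    mask[start:end] = True

local_sq_norm = flat_grad.masked_fill(mask, 0.0).float().norm(2) ** 2
# Summing local_sq_norm across ranks (then taking the sqrt) now counts each
# parameter exactly once, instead of the old behaviour where TP ranks > 0
# contributed nothing and the sharded portions were under-counted.
```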
In cases where TP size increases the inconsistent gap should be larger. with this change 1.3B with EP=4 TP=4 &1 , fp16,mbs=1,gbs=16 ![image](https://github.com/microsoft/DeepSpeed/assets/27563729/855042c8-ac8a-4192-b465-5fa60c1a7c59) without this change 1.3B with EP=4 TP=4&1 ,fp16,mbs=1,gbs=16 ![image](https://github.com/microsoft/DeepSpeed/assets/27563729/66854d14-7b83-4b09-a669-b452d6157ea0) --------- Co-authored-by: Conglong Li --- deepspeed/runtime/fp16/fused_optimizer.py | 65 +++++++++++++++++++---- deepspeed/runtime/utils.py | 54 ++++++++----------- 2 files changed, 77 insertions(+), 42 deletions(-) diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index af8050c4646a..bf1693307ea7 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -9,15 +9,16 @@ import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers +from deepspeed.runtime.utils import get_global_norm, get_flattened_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers, is_model_parallel_parameter from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger, log_dist from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import is_moe_param_group +from deepspeed.runtime.constants import PIPE_REPLICATED +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank OVERFLOW_CHECK_TIMER = 'overflow_check' COMPUTE_NORM_TIMER = 'compute_norm' @@ -64,6 +65,8 @@ def __init__(self, self.fp16_groups_flat = [] self.fp32_groups_flat = [] + self.flatten_grad_norm_mask_list = [] + self.has_executed_step = False self._global_grad_norm = 0. # loop to deal with groups @@ -206,6 +209,40 @@ def override_loss_scale(self, loss_scale): self.custom_loss_scaler = True self.external_loss_scale = loss_scale + def _require_avoid_recompute_norm(self, p, tensor_model_parallel_rank): + # for filtering replicated tensors from tensor + if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: + return True + if (tensor_model_parallel_rank > 0) and not is_model_parallel_parameter(p): + return True + + def _get_norm_mask_idx(self, group): + """The function preserves the parallel information for norm + from unflattened gradients. + + Args: + group (Iterable[Tensor] ): params group + + Returns: + torch.Tensor: A 2D tensor containing index ranges for each group, + where each row represents a [start index, end index]. + """ + group_mask_idx_list = [] + grad_flat_st_idx = 0 + grad_flat_en_idx = 0 + + for p in group: + grad_flat_en_idx = grad_flat_st_idx + p.numel() + if p.grad is not None and self._require_avoid_recompute_norm(p, bwc_tensor_model_parallel_rank(self.mpu)): + # merge range + if len(group_mask_idx_list) > 0 and grad_flat_st_idx == group_mask_idx_list[-1][-1]: + group_mask_idx_list[-1][-1] = grad_flat_en_idx + else: + group_mask_idx_list.append([grad_flat_st_idx, grad_flat_en_idx]) + grad_flat_st_idx = grad_flat_en_idx + + return torch.tensor(group_mask_idx_list, device=get_accelerator().current_device()) + def step(self, closure=None): """ Not supporting closure. 
@@ -251,23 +288,32 @@ def step(self, closure=None): for p in group ])) - for p in group: - p.grad = None - self.fp32_groups_flat[i].grad = grads_groups_flat[i] param_group = self.optimizer.param_groups[i] + + # split expert and non_expert grads for norm if self.has_moe_layers and is_moe_param_group(param_group): if param_group['name'] not in expert_grads_for_norm: expert_grads_for_norm[param_group['name']] = [] + expert_grads_for_norm[param_group['name']].append(self.fp32_groups_flat[i]) else: + # retrieves the required mask for calculating the norm of flat_grad + # perform this collect operation only once + if not self.has_executed_step: + cur_flat_grad_norm_mask = self._get_norm_mask_idx(group) + self.flatten_grad_norm_mask_list.append(cur_flat_grad_norm_mask) + non_experts_grads_for_norm.append(self.fp32_groups_flat[i]) - self.timers(COMPUTE_NORM_TIMER).start() + for p in group: + p.grad = None - all_groups_norm = get_grad_norm(non_experts_grads_for_norm, mpu=self.mpu) + self.timers(COMPUTE_NORM_TIMER).start() - self.timers(COMPUTE_NORM_TIMER).stop() + all_groups_norm = get_flattened_grad_norm(non_experts_grads_for_norm, + mpu=self.mpu, + grad_norm_mask=self.flatten_grad_norm_mask_list) if self.has_moe_layers: all_groups_norm = get_norm_with_moe_layers(all_groups_norm, @@ -276,6 +322,7 @@ def step(self, closure=None): norm_type=self.norm_type) scaled_global_grad_norm = get_global_norm(norm_list=[all_groups_norm]) + self.timers(COMPUTE_NORM_TIMER).stop() # Stash unscaled gradient norm self._global_grad_norm = scaled_global_grad_norm / self.cur_scale @@ -298,7 +345,7 @@ def step(self, closure=None): updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data.copy_(q.data) - + self.has_executed_step = True self.timers(UPDATE_FP16_TIMER).stop() self.timers.log(STEP_TIMERS) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index c55f8a0e2995..7744b2ee8b98 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -17,7 +17,6 @@ import torch from deepspeed import comm as dist - try: from torch._six import inf except ModuleNotFoundError: @@ -385,7 +384,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): return total_norm -def get_grad_norm(parameters, norm_type=2, mpu=None): +def get_flattened_grad_norm(parameters, norm_type=2, mpu=None, grad_norm_mask=None): """Get grad norm of an iterable of parameters. This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and @@ -397,7 +396,8 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): single Tensor that will have gradients normalized norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. - + grad_norm_mask (List[Tensor]): A list of Tensor, where + each Tensor is a 2D Tensor containing ranges of [start_index, end_index]. Returns: Total norm of the parameters (viewed as a single vector). """ @@ -415,18 +415,25 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): total_norm = total_norm_cuda[0].item() else: total_norm = 0. - tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=mpu) - for p in parameters: - # Pipeline parallelism may replicate parameters. Avoid multi-counting. 
- if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: - continue - - # Filter to avoid over-counting replicated tensors from tensor - # model parallelism - if (tensor_mp_rank > 0) and not is_model_parallel_parameter(p): - continue + for idx, p in enumerate(parameters): + # Use grad_norm_mask to avoid redundant computation of flattened gradient norm + if grad_norm_mask is not None and len(grad_norm_mask[idx]) > 0: + + # A loop-free implementation to create a mask tensor based on a range list + # which is logically equivalent to the following implementation. + # # mask_tensor_ = torch.zeros_like(p, device=p.device, dtype=bool) + # # for mask_idx in grad_norm_mask[idx]: + # # mask_tensor_[mask_idx[0]:mask_idx[1]] = True + cum_sum_pairs = torch.tensor([1, -1], device=get_accelerator().current_device(), + dtype=p.dtype).repeat(grad_norm_mask[idx].shape[0], 1) + mask_tensor = torch.zeros(p.shape[0] + 1, device=get_accelerator().current_device(), dtype=p.dtype) + mask_tensor = mask_tensor.scatter_(0, grad_norm_mask[idx].view(-1), + cum_sum_pairs.view(-1)).cumsum(0).bool()[:-1] + + param_norm = torch.masked_fill(p.grad.data, mask_tensor, 0).float().norm(norm_type) - param_norm = p.grad.data.float().norm(norm_type) + else: + param_norm = p.grad.data.float().norm(norm_type) total_norm += param_norm.item()**norm_type # Sum across all model parallel GPUs. @@ -814,25 +821,6 @@ def get_only_unique_item(items): return unique_item -def clip_gradients(parameters, max_norm=1.0, global_grad_norm=None, mpu=None, eps=1e-6): - """Clip the gradient of a list of parameters. - Args: - parameters: List of parameters whose .grad will be clipped. - global_grad_norm (float, optional): Precomputed gradient norm. Defaults to None. - mpu (optional): model parallelism unit. Defaults to None. - eps (float, optional): epsilon value added to grad norm. Defaults to 1e-6 - Returns: - float: the global gradient norm - """ - if global_grad_norm is None: - global_grad_norm = get_grad_norm(parameters, mpu=mpu) - clip_coef = max_norm / (global_grad_norm + eps) - if clip_coef < 1: - for p in parameters: - p.grad.detach().mul_(clip_coef) - return global_grad_norm - - def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=False, moe_ep_group=None): """Get norm of an iterable of tensors. From 34969d2091fcfd37545361111d7ce7d13229d1a3 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:27:45 -0700 Subject: [PATCH 19/36] Update 'create-pr' action in release workflow to latest (#5415) A [warning is shown](https://github.com/microsoft/DeepSpeed/actions/runs/8695213322/job/23845782048#step:10:31) when we do releases: ``` [deploy](https://github.com/microsoft/DeepSpeed/actions/runs/8695213322/job/23845782048) Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: peter-evans/create-pull-request@v4. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/. 
``` To resolve this we update the create a pull request to `@v6`, see release notes [here](https://github.com/peter-evans/create-pull-request/releases) --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5a931125eff6..2f571a14b228 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -35,7 +35,7 @@ jobs: run: | python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }} - name: Create Pull Request - uses: peter-evans/create-pull-request@v4 + uses: peter-evans/create-pull-request@v6 with: token: ${{ secrets.GH_PAT }} add-paths: | From bc0f77472828323fdc0ae67f62123948bc2b12d1 Mon Sep 17 00:00:00 2001 From: "Etienne.bfx" Date: Tue, 16 Apr 2024 21:43:44 +0200 Subject: [PATCH 20/36] Update engine.py to avoid torch warning (#5408) The state_dict function of module.py from torch write a warning if arguments are positional arguments and not keyword arguments --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: ebonnafoux --- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 992d7877c179..9a2b943b0992 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -2542,7 +2542,7 @@ def all_gather_scalar(self, value, dp_group): return tensor_list def module_state_dict(self, destination=None, prefix="", keep_vars=False, exclude_frozen_parameters=False): - sd = self.module.state_dict(destination, prefix, keep_vars) + sd = self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) # Remove frozen parameter weights from state_dict if specified if exclude_frozen_parameters: From a9cbd688f01c7742397f53144b2223f9956540ac Mon Sep 17 00:00:00 2001 From: Shafiq Jetha <1066864+fasterinnerlooper@users.noreply.github.com> Date: Tue, 16 Apr 2024 14:49:35 -0600 Subject: [PATCH 21/36] Update _sidebar.scss (#5293) The right sidebar disappears off of the right side of the page. These changes will help bring the content back and place it correctly on the page. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- docs/_sass/minimal-mistakes/_sidebar.scss | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/_sass/minimal-mistakes/_sidebar.scss b/docs/_sass/minimal-mistakes/_sidebar.scss index 63cef338c583..312a6279b9b0 100644 --- a/docs/_sass/minimal-mistakes/_sidebar.scss +++ b/docs/_sass/minimal-mistakes/_sidebar.scss @@ -76,10 +76,9 @@ @include breakpoint($large) { position: absolute; - top: 0; + top: auto; right: 0; width: $right-sidebar-width-narrow; - margin-right: -1.5 * $right-sidebar-width-narrow; padding-left: 1em; z-index: 10; @@ -94,7 +93,6 @@ @include breakpoint($x-large) { width: $right-sidebar-width; - margin-right: -1.5 * $right-sidebar-width; } } From 4d4c99d9e51afa01a8f4fc7d59a0bc09128abf3e Mon Sep 17 00:00:00 2001 From: Liangliang-Ma Date: Thu, 18 Apr 2024 00:48:28 +0800 Subject: [PATCH 22/36] Add more tests into XPU CI (#5427) After #5412 merged. We can add more tests into the CI. 
--- .github/workflows/xpu-max1100.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index f83c58dc7ff5..34325b4522d7 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -61,14 +61,28 @@ jobs: - name: Unit tests run: | - pip install pytest pytest-timeout tabulate + pip install pytest pytest-timeout tabulate tensorboard wandb + export ONEAPI_ROOT=/opt/intel/oneapi/redist + export FI_PROVIDER_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib/prov + export LD_LIBRARY_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=$ONEAPI_ROOT/lib:$LD_LIBRARY_PATH cd tests/unit pytest --verbose accelerator/* pytest --verbose autotuning/* pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose checkpoint/test_moe_checkpoint.py + pytest --verbose checkpoint/test_shared_weights.py pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose moe/test_moe_tp.py + pytest --verbose monitor/* pytest --verbose runtime/test_ds_config_model.py pytest --verbose runtime/pipe/test_pipe_schedule.py pytest --verbose runtime/zero/test_zero_config.py pytest --verbose runtime/zero/test_zero_tiled.py pytest --verbose runtime/zero/test_zeropp.py + pytest --verbose runtime/test_autocast.py + pytest --verbose runtime/test_data.py + pytest --verbose runtime/test_runtime_utils.py + pytest --verbose runtime/activation_checkpointing/* + pytest --verbose runtime/utils/* + pytest --verbose runtime/zero/test_zero_dynamic_class.py From b22706a7211366abf2df98a0d118ea1d3a837e21 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 18 Apr 2024 02:52:36 +0800 Subject: [PATCH 23/36] [CPU] Support SHM based inference_all_reduce in TorchBackend (#5391) This PR adds SHM based `inference_all_reduce` kernel to `TorchBackend` communication backend. When inference on CPU server, this path replaces default `torch.distributed.all_reduce` which eventurally use gloo backend. This PR will improve inference performance with AutoTP when only stock PyTorch is installed without Intel Extension for PyTorch. Compared with gloo backend. SHM based inference_all_reduce kernel is a more directed path and perform much better on single node. | message size | gloo all_reduce(ms) | SHM all_reduce(ms) | | --- | --- | --- | | 32MB | 30.7 | 0.65 | | 64KB | 0.23 | 0.028 | In text generation of bloom-3b with AutoTP, average token latency improved 1.45x with this PR on 2S Xeon node. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- accelerator/cpu_accelerator.py | 6 +- csrc/cpu/comm/ccl.cpp | 2 +- csrc/cpu/comm/shm_interface.cpp | 120 ++++++++++++++++++++++++++++++++ deepspeed/comm/ccl.py | 6 +- deepspeed/comm/comm.py | 2 +- deepspeed/comm/torch.py | 20 +++++- op_builder/cpu/__init__.py | 2 +- op_builder/cpu/comm.py | 27 +++++++ 8 files changed, 174 insertions(+), 11 deletions(-) create mode 100644 csrc/cpu/comm/shm_interface.cpp diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 870d3e91816e..a0171723cfb8 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -300,12 +300,14 @@ def get_op_builder(self, class_name): # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed # if successful this also means we're doing a local install and not JIT compile path from op_builder import __deepspeed__ # noqa: F401 # type: ignore - from op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder except ImportError: - from deepspeed.ops.op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from deepspeed.ops.op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder if class_name == "CCLCommBuilder": return CCLCommBuilder + elif class_name == "ShareMemCommBuilder": + return ShareMemCommBuilder elif class_name == "FusedAdamBuilder": return FusedAdamBuilder elif class_name == "CPUAdamBuilder": diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 786906717f23..d28509e59266 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -247,7 +247,7 @@ void all_reduce_caching(torch::Tensor& data, .wait()); } -void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) +void inference_all_reduce(torch::Tensor& data, py::object op) { #ifdef DO_PROFILE static double total_time = 0.0; diff --git a/csrc/cpu/comm/shm_interface.cpp b/csrc/cpu/comm/shm_interface.cpp new file mode 100644 index 000000000000..981ea36515fc --- /dev/null +++ b/csrc/cpu/comm/shm_interface.cpp @@ -0,0 +1,120 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include "shm.h" + +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif + +// Communication settings +static int world_rank = -1; +static int world_size = -1; + +static bool is_initialized = 0; + +static bool all_ranks_local_p = false; + +void initialize(int size, int rank) +{ + if (is_initialized) return; + + // Check whether all ranks is on the same physical machine. 
+ // If true, we will use an SHM based low latency allreduce + + auto ls_string = std::getenv("LOCAL_SIZE"); + int ls = 0; + if (ls_string != NULL) { ls = std::stoi(std::getenv("LOCAL_SIZE")); } + + if (size >= 1 && size == ls) { all_ranks_local_p = true; } + + world_size = size; + world_rank = rank; + is_initialized = 1; + + auto addr_string = std::getenv("MASTER_ADDR"); + if (addr_string == NULL) { addr_string = ""; } + auto port_string = std::getenv("MASTER_PORT"); + if (port_string == NULL) { port_string = ""; } + + if (all_ranks_local_p) { shm_initialize(size, rank, addr_string, port_string); } +} + +int get_rank(int group = 0) { return world_rank; } + +int get_world_size(int group = 0) { return world_size; } + +// Success - return 0 +// Fail (cannot hornor the request and need to fall back) - return -1 +int inference_all_reduce(torch::Tensor& data, py::object op) +{ + if (!all_ranks_local_p) return -1; +#ifdef DO_PROFILE + static double total_time = 0.0; + static double total_time_sq = 0.0; + static int count = -16; // warmup + static double max_time = 0.0; + static double min_time = DBL_MAX; + // make sure all rank reach this point before measuring time + // turn on this if you suspect each rank didn't reach here at the same time (stragger) + // if (all_ranks_local_p) { barrier_wait(0, world_size); } + auto start = std::chrono::system_clock::now(); +#endif + + static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); + static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); + + assert(py::int_(op.attr("value")) == ReduceOpSum); + + auto numel = data.numel(); + + int data_size = 0; + bool data_type_fallback = false; + + switch (data.scalar_type()) { + case c10::ScalarType::BFloat16: data_size = numel * 2; break; + case c10::ScalarType::Float: data_size = numel * 4; break; + default: data_type_fallback = true; + } + + if (data_type_fallback) return -1; + + all_reduce_outer_loop(data, numel, data_size); + +#ifdef DO_PROFILE + auto end = std::chrono::system_clock::now(); + count++; + if (count > 0) { + double elapsed = std::chrono::duration_cast(end - start).count(); + if (elapsed > max_time) { max_time = elapsed; } + if (elapsed < min_time) { min_time = elapsed; } + total_time += elapsed; + total_time_sq += elapsed * elapsed; + if (world_rank == 0 && count == 1000) { + auto avg = total_time / count; + auto sd = + sqrt(total_time_sq / count - total_time * total_time / (count * count)) / avg * 100; + printf(" C++ kernel\t\t %.2f\t %.2f\t%.2f\t %.2f\n", + min_time, + max_time, + total_time / count, + sd); + } + } +#endif + return 0; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("initialize", &initialize, "shm initialize"); + m.def("get_rank", &get_rank, "get rank"); + m.def("get_world_size", &get_world_size, "get world size"); + m.def("inference_all_reduce", &inference_all_reduce, "low latency all_reduce implementation"); +} diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index a199843d5086..f2a6cb6a36dc 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -98,12 +98,12 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): else: return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op) - def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): + def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None): name = "inference_all_reduce" if name in self.available_coll: - return 
self.ccl_comm_op.inference_all_reduce(tensor, op, async_op) + return self.ccl_comm_op.inference_all_reduce(tensor, op) else: - return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=async_op) + return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=False) def broadcast(self, tensor, src, group=None, async_op=False): return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index c92ae74375f1..4f826edab3d6 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -505,7 +505,7 @@ def inference_all_reduce(tensor, log_name='all_reduce', debug=get_caller_func()): global cdb - return cdb.inference_all_reduce(tensor, op, group, async_op) + return cdb.inference_all_reduce(tensor, op, group) @timed_op diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 35c31f9fe2ec..14d4f3847315 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -3,6 +3,7 @@ # DeepSpeed Team +import deepspeed from deepspeed import utils from .utils import * @@ -19,6 +20,15 @@ DS_COMM_REDUCE_OFF = False +def build_shm_op(): + builder = get_accelerator().create_op_builder("ShareMemCommBuilder") + if builder is None or not deepspeed.ops.__compatible_ops__[builder.NAME]: + return None + shm_cpp_module = builder.load() + print(f'DeepSpeed {builder.absolute_name()} built successfully') + return shm_cpp_module + + def has_coalescing_manager(): has_c10d = hasattr(torch.distributed, 'distributed_c10d') return has_c10d and hasattr(torch.distributed.distributed_c10d, '_coalescing_manager') @@ -88,6 +98,7 @@ class TorchBackend(Backend): def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'): super(TorchBackend, self).__init__() + self.shm_comm_op = build_shm_op() self.has_all_reduce_coalesced = has_all_reduce_coalesced() self.has_coalescing_manager = has_coalescing_manager() self.all_gather_function = self.get_all_gather_function() @@ -99,6 +110,8 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name=' # it is not so we can run on a single GPU without doing any init_process_group self.single_gpu_mode = True self.init_process_group(backend, timeout, init_method, rank, world_size) + if self.shm_comm_op != None: + self.shm_comm_op.initialize(self.get_world_size(), self.get_rank()) @classmethod @compiler.disable @@ -139,9 +152,10 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, asyn return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) @compiler.disable - def inference_all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): - op = self._reduce_op(op) - return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + def inference_all_reduce(self, tensor, op, group=None): + if self.shm_comm_op == None or self.shm_comm_op.inference_all_reduce(tensor, op) == -1: + op = self._reduce_op(op) + return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=False) @compiler.disable def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py index 0ecf2caf65ae..30238add3f90 100644 --- a/op_builder/cpu/__init__.py +++ b/op_builder/cpu/__init__.py @@ -4,7 +4,7 @@ # DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' -from .comm import 
CCLCommBuilder +from .comm import CCLCommBuilder, ShareMemCommBuilder from .fused_adam import FusedAdamBuilder from .cpu_adam import CPUAdamBuilder from .no_impl import NotImplementedBuilder diff --git a/op_builder/cpu/comm.py b/op_builder/cpu/comm.py index 38e965530f43..69b9226fd806 100644 --- a/op_builder/cpu/comm.py +++ b/op_builder/cpu/comm.py @@ -42,3 +42,30 @@ def extra_ldflags(self): return [] else: return ['-lccl', f'-L{ccl_root_path}/lib'] + + +class ShareMemCommBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_SHM_COMM" + NAME = "deepspeed_shm_comm" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def sources(self): + return ['csrc/cpu/comm/shm_interface.cpp', 'csrc/cpu/comm/shm.cpp'] + + def include_paths(self): + includes = ['csrc/cpu/includes'] + return includes + + def cxx_args(self): + return ['-O2', '-fopenmp'] + + def is_compatible(self, verbose=True): + # TODO: add soft compatibility check for private binary release. + # a soft check, as in we know it can be trivially changed. + return super().is_compatible(verbose) From 3194fe85c5b62940f5479deba0396121f2d87d59 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:27:07 -0700 Subject: [PATCH 24/36] Add required paths to trigger AMD tests on PRs (#5406) Even though AMD tests are currently broken, this will at least trigger them on PRs that touch files that might impact them. Since the test name is listed as `amd-tests` rather than `unit-tests` they will currently not be required, however. Co-authored-by: root --- .github/workflows/amd-mi200.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index 00ff72ac8929..cd1cafe8e679 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -2,6 +2,10 @@ name: amd-mi200 on: workflow_dispatch: + pull_request: + paths: + - '.github/workflows/amd-mi200.yml' + - 'requirements/**' schedule: - cron: "0 0 * * *" From aaaf8bc5e07535e263f83733f8905400bf6f5aca Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 18 Apr 2024 20:39:07 +0200 Subject: [PATCH 25/36] Bug fix in `split_index` method (#5292) Bug description: on a dataset of 20 samples, when running 4 workers with 8 threads per worker, then the `split_dataset` would return for worker id `1`: ``` self.worker_splits [[0, 5], [5, 10], [10, 15], [15, 20]] self.thread_splits [[5, 6], [6, 7], [7, 8], [8, 9], [9, 10], [10, 10], [11, 10], [12, 10]] ``` `thread_splits` is wrong and causes a crash in the `DataAnalyzer`: the end sample id is lower than the initial one on the last 2 threads. 
This PR fixes that by fixing the behaviour of `split_index` --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/data_pipeline/data_sampling/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/utils.py b/deepspeed/runtime/data_pipeline/data_sampling/utils.py index 9c643f3705de..dc55f96e222d 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/utils.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/utils.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import math import numpy as np from deepspeed.utils import logger @@ -32,10 +31,8 @@ def find_fit_int_dtype(min_value, max_value): def split_index(start_idx, end_idx, num_partitions): - partition_size = math.ceil((end_idx - start_idx) / num_partitions) - partitions = [[start_idx + x * partition_size, - min(end_idx, start_idx + (x + 1) * partition_size)] for x in range(num_partitions)] - return partitions + partition_boundaries = np.linspace(start_idx, end_idx, dtype=int, num=num_partitions + 1) + return [(partition_boundaries[i], partition_boundaries[i + 1]) for i in range(num_partitions)] def split_dataset(dataset, num_workers, worker_id, num_threads): From 64defe65b73f856466935befc49fb188756ab558 Mon Sep 17 00:00:00 2001 From: Bruno Magalhaes Date: Thu, 18 Apr 2024 23:14:08 +0200 Subject: [PATCH 26/36] Parallel map step for `DistributedDataAnalyzer` map-reduce (#5291) - adds multi CPU-processing to the `DistributedDataAnalyzer` map operation (parallelism set with parameter `num_workers`). Works with a `SharedMemory` / `Manager's` queue per metric, written concurrently by processes. - much faster `write_buffer_to_file` in `DistributedDataAnalyzer` reduce operation by copying to cpu and "detaching" output tensor. 
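As a rough, self-contained sketch of the per-metric queue pattern described above (the names `run_map_on_slice`, `metric_mod` and `metric_square` are illustrative and not part of this PR's diff), each worker process maps its own slice of the data and pushes one result per metric into a shared `multiprocessing.Manager` queue, which the parent drains after joining:

```python
from multiprocessing import Manager, Process

def metric_mod(x):
    return x % 7

def metric_square(x):
    return x * x

METRIC_FNS = [metric_mod, metric_square]

def run_map_on_slice(thread_id, splits, metric_queues):
    # each worker handles one contiguous slice and accumulates every metric
    start, end = splits[thread_id]
    results = [sum(fn(i) for i in range(start, end)) for fn in METRIC_FNS]
    for queue, result in zip(metric_queues, results):
        queue.put((thread_id, result))  # one shared queue per metric

if __name__ == "__main__":
    splits = [(0, 250), (250, 500), (500, 750), (750, 1000)]
    with Manager() as manager:
        metric_queues = [manager.Queue() for _ in METRIC_FNS]
        workers = [Process(target=run_map_on_slice, args=(t, splits, metric_queues))
                   for t in range(len(splits))]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        # drain each metric's queue; sorting by thread id restores slice order
        gathered = [sorted(q.get() for _ in range(len(splits))) for q in metric_queues]
        print(gathered)
```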
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Conglong Li --- .../data_sampling/data_analyzer.py | 128 ++++++++++++------ .../data_sampling/indexed_dataset.py | 6 +- 2 files changed, 93 insertions(+), 41 deletions(-) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index 7088df223bd8..93d351169834 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import sys from collections import defaultdict import csv import time @@ -12,8 +13,8 @@ import torch from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset -from deepspeed.utils import logger import deepspeed.comm as dist +from deepspeed.utils import logger from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset, valid_dtypes from deepspeed.runtime.data_pipeline.data_sampling.utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype @@ -457,6 +458,7 @@ def __init__( self, dataset, num_workers=1, + num_threads=1, worker_id=0, batch_size=1, metric_names=[], @@ -477,6 +479,8 @@ def __init__( self.collate_fn = collate_fn self.device = device self.sample_indices = sample_indices + self.num_threads = num_threads + self.worker_id = worker_id if not dist.is_initialized(): dist.init_distributed() @@ -494,13 +498,9 @@ def __init__( if self.worker_id == 0: logger.info(f"Distributed data analyzer initialized with {self.num_workers} workers.") - def run_map_reduce(self): - - # setup individual dataloaders - worker_splits, _ = split_dataset(self.dataset, self.num_workers, self.worker_id, num_threads=1) - start_idx, end_idx = worker_splits[self.worker_id] - logger.info(f"worker {self.worker_id}: start working on data subset {start_idx} to {end_idx}") - worker_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) + def run_map_helper(self, thread_id=0, metric_queues=None): + thread_start_idx, thread_end_idx = self.thread_splits[thread_id][0], self.thread_splits[thread_id][1] + worker_dataset = Subset(self.dataset, list(range(thread_start_idx, thread_end_idx))) sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) dataloader = DataLoader(dataset=worker_dataset, batch_sampler=sampler, @@ -516,7 +516,7 @@ def run_map_reduce(self): metric_results.append([] if metric_type == 'single_value_per_sample' else None) # iterate dataloader and store metric results - batch_start_idx = start_idx + batch_start_idx = thread_start_idx for data in dataloader: for m_idx in range(len(self.metric_names)): metric_type, metric_function = self.metric_types[m_idx], self.metric_functions[m_idx] @@ -544,15 +544,73 @@ def run_map_reduce(self): metric_results[m_idx].add_(metric_values) batch_start_idx += len(data) + if self.num_threads == 1: + return metric_results + + # copy metric_results to the shared queue + assert metric_queues + for m_idx in range(len(self.metric_names)): + results = metric_results[m_idx] + if torch.is_tensor(results): + results = results.item() if results.dim() == 0 else results.tolist() + try: + metric_queues[m_idx].put((thread_id, results)) + except Exception as e: + logger.error(f"Error putting metric results to queue: {e}") + sys.exit(1) + + def run_map_reduce(self): + + # setup individual dataloaders + 
self.worker_splits, self.thread_splits = split_dataset(self.dataset, + self.num_workers, + self.worker_id, + num_threads=self.num_threads) + node_start_idx, node_end_idx = self.worker_splits[self.worker_id] + logger.info(f"worker {self.worker_id} working on data subset {node_start_idx} to {node_end_idx}.") + + if self.num_threads in [0, 1, None]: + metric_results = self.run_map_helper() + metric_results = [torch.tensor(m).to(self.device) for m in metric_results] + else: + + # create a shared queue of results per metric to be populated by individual threads + with Manager() as manager: + metric_queues = [manager.Queue() for _ in self.metric_names] + threads = [ + Process(target=self.run_map_helper, args=(t, metric_queues)) for t in range(self.num_threads) + ] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # gather results from shared queues into metric_results + metric_results = [None for _ in self.metric_names] + for m_idx, (queue, metric_type) in enumerate(zip(metric_queues, self.metric_types)): + while not queue.empty(): + t_idx, t_results = queue.get() + t_start_idx, t_end_idx = self.thread_splits[t_idx] + if t_start_idx >= t_end_idx: # no results from this thread + continue #corner case for small datasets and high thread count + t_results = torch.tensor(t_results) + if metric_type == 'single_value_per_sample': + # add thread results to the metric_results list, ordered by thread idx + if metric_results[m_idx] is None: # initialize if needed + metric_results[m_idx] = torch.zeros(node_end_idx - node_start_idx, + t_results.size(1)).to(self.device) + metric_results[m_idx][t_start_idx - node_start_idx:t_end_idx - node_start_idx] = t_results + else: + if metric_results[m_idx] is None: # initialize if needed + metric_results[m_idx] = torch.zeros(t_results.size()).to(self.device) + metric_results[m_idx].add_(t_results) + # compute dtype for sample ids total_num_samples = len(self.dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) logger.info(f"Total number of data samples: {total_num_samples}.") logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") - # convert to list of tensors - metric_results = [torch.tensor(m).to(self.device) for m in metric_results] - for m_idx in range(len(self.metric_names)): metric_values, metric_name, metric_type = \ metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] @@ -611,8 +669,8 @@ def run_map_reduce(self): def file_write_ordered(self, tensor_list, fname, numpy_dtype): """ MPI_file_write_ordered extended to write a list of tensors, by one rank, iteratively """ - # each not has a list of rows (tensors) to be written to the file. - # we will serialize it to communicate it in one comm step. + # each node has a list of rows (tensors) to be written to the file. + # we will serialize it in order to communicate it in one comm step. 
tkwargs = dict(dtype=torch.int64, device=self.device) @@ -636,17 +694,13 @@ def file_write_ordered(self, tensor_list, fname, numpy_dtype): def write_buffer_to_file(buff, src, builder): assert self.worker_id == 0, "only rank 0 can write to file" - # # write one buffer at a time - # for row_len in row_lens[src]: - # builder.add_item(buff[:row_len].cpu()) - # buff = buff[row_len:] - - # collect all buffers and write them all at once - buffer_list = [] - for row_len in row_lens[src]: - buffer_list.append(buff[:row_len].cpu()) - buff = buff[row_len:] - builder.add_items(buffer_list) + # collect all buffers and write them at once + buff = buff.cpu().detach().numpy() + row_offsets = np.cumsum([0] + row_lens[src].tolist()) + arr_list = [] + for i in range(len(row_lens[src])): + arr_list.append(buff[row_offsets[i]:row_offsets[i + 1]]) + builder.add_items(arr_list) # 5. rank 0 prepares output folder and file if self.worker_id == 0: @@ -700,7 +754,7 @@ def gather_v(tensor, dst, comm_group, num_workers, worker_id): # all_gather requires all tensors to be of same size so we need to pad them max_size = max(sizes).item() buffer = torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) - buffer[0:size] = torch.tensor(tensor, dtype=tensor.dtype, device=tensor.device) + buffer[0:size] = tensor.data buffer_list = None if worker_id == 0: # create padded recv buffers buffer_list = [torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) for _ in range(num_workers)] @@ -763,16 +817,18 @@ def sample_sort(tensor, comm_group, num_workers, n_samples=100): def test_compare_both_data_analyzers(dataset): """ given a dataset, compare file and memory based data analyser""" - id = lambda t: torch.tensor(t).to(torch.int64) # identity + id = lambda t: t.to(torch.int64) # identity batch_sum = lambda t: id(t).sum() #sum batch + num_threads = 4 kwargs = dict( dataset=dataset, - batch_size=3, + batch_size=2**10, worker_id=int(os.environ['RANK']), num_workers=int(os.environ['WORLD_SIZE']), metric_names=["mod", "batch_sum"], metric_functions=[id, batch_sum], metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], + num_threads=num_threads, ) dda = DistributedDataAnalyzer( @@ -785,10 +841,9 @@ def test_compare_both_data_analyzers(dataset): if dda.worker_id == 0: print("DistributedDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) - da = DataAnalyzer(num_threads=2, - num_threads_reduce=2, - metric_dtypes=[torch.int64, torch.int64], + da = DataAnalyzer(num_threads_reduce=num_threads, save_path="./output_disk", + metric_dtypes=[torch.int64, torch.int64], **kwargs) start_time = time.time() da.run_map_reduce() @@ -815,14 +870,11 @@ def test_compare_both_data_analyzers(dataset): class TestDataset(torch.utils.data.Dataset): - def __init__(self, size=20): - self.values = [1001 + x % 6 for x in range(size)] + def __init__(self, size=10_000_000): + self.values = [(x + 7) % 10_000 for x in range(size)] self.size = size - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.values[idx] + __len__ = lambda self: self.size + __getitem__ = lambda self, idx: self.values[idx] test_compare_both_data_analyzers(TestDataset()) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index 453e6ba6039d..872d05de0145 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -586,9 +586,9 @@ def 
add_item(self, tensor): self._data_file.write(np_array.tobytes(order='C')) self._sizes.append(np_array.size) - def add_items(self, tensor_list): - """ write a list of tensors to the file and update their sizes in the index""" - np_arrays = [np.array(t.numpy(), dtype=self._dtype) for t in tensor_list] + def add_items(self, arr_list): + """ write a list of arrays to the file and update their sizes in the index""" + np_arrays = [arr.astype(self._dtype) for arr in arr_list] self._data_file.writelines([arr.tobytes(order='C') for arr in np_arrays]) for arr in np_arrays: self._sizes.append(arr.size) From c632ea09f8d107d10f76aa2b776e4df3c1ccf98a Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Fri, 19 Apr 2024 08:58:27 -0700 Subject: [PATCH 27/36] Selective dequantization (#5375) This PR adds a new functionality for the dequantizer function, called `selective_dequantize`, which enables partially dequantizing a 3-dimensional matrix in case we don't need to dequantize all the data from lower bit (like fp8/fp6) to bf16. I also added a unit test to check its functionality. --------- Co-authored-by: Reza Yazdani Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- csrc/fp_quantizer/includes/quantize.h | 11 ++ csrc/fp_quantizer/quantize.cpp | 31 +++++ csrc/fp_quantizer/quantize.cu | 135 +++++++++++++++++++ deepspeed/ops/fp_quantizer/quantize.py | 35 +++++ tests/unit/ops/fp_quantizer/test_fp_quant.py | 29 ++++ 5 files changed, 241 insertions(+) diff --git a/csrc/fp_quantizer/includes/quantize.h b/csrc/fp_quantizer/includes/quantize.h index 2204c1ba74fc..507252d6e722 100644 --- a/csrc/fp_quantizer/includes/quantize.h +++ b/csrc/fp_quantizer/includes/quantize.h @@ -113,3 +113,14 @@ void launch_dequantization(uint8_t* val, int q_mantisa_bits, int q_exponent_bits, cudaStream_t stream); + +template +void launch_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int num_groups, + int group_size, + int num_indexes, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream); diff --git a/csrc/fp_quantizer/quantize.cpp b/csrc/fp_quantizer/quantize.cpp index 4a88ff767636..ec631c576e27 100644 --- a/csrc/fp_quantizer/quantize.cpp +++ b/csrc/fp_quantizer/quantize.cpp @@ -78,8 +78,39 @@ void dequantize(torch::Tensor& val, #endif } +#define DISPATCH_DEQUANTIZE_INDEX(T_TYPE, C_TYPE, mantisa) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_selective_dequantization((uint8_t*)val_q.data_ptr(), \ + (C_TYPE*)val.data_ptr(), \ + (int32_t*)indexes.data_ptr(), \ + num_groups, \ + group_size, \ + num_indexes, \ + q_mantisa_bits, \ + q_exponent_bits, \ + at::cuda::getCurrentCUDAStream()); \ + return; \ + } +void selective_dequantize(torch::Tensor& val, + torch::Tensor& val_q, + torch::Tensor& indexes, + int group_size, + int q_mantisa_bits, + int q_exponent_bits) +{ + int total_elems = at::numel(val); + int num_indexes = indexes.size(0); + int num_groups = total_elems / group_size; + + DISPATCH_DEQUANTIZE_INDEX(kHalf, __half, 10); +#ifdef BF16_AVAILABLE + DISPATCH_DEQUANTIZE_INDEX(kBFloat16, __nv_bfloat16, 7); +#endif +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("quantize", &quantize, "quantize function"); m.def("dequantize", &dequantize, "dequantize function"); + m.def("selective_dequantize", &selective_dequantize, "selective dequantize function"); } diff --git a/csrc/fp_quantizer/quantize.cu b/csrc/fp_quantizer/quantize.cu index 5f0b58f124f0..5ada6894747f 100644 --- 
a/csrc/fp_quantizer/quantize.cu +++ b/csrc/fp_quantizer/quantize.cu @@ -270,6 +270,7 @@ __global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size, int mem_access::load_global( int8_data + quantization::quanitzed_access_granularity_6bits * 2, load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { mem_access::load_global(int8_data, load_base_ptr); @@ -393,3 +394,137 @@ void launch_dequantization(uint8_t* val, INSTANTIATE_LAUNCH_DEQUANTIZATION(__nv_bfloat16, 7); #endif INSTANTIATE_LAUNCH_DEQUANTIZATION(__half, 10); + +template +__global__ void apply_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int group_size, + int total_num_elements) +{ + int index = indexes[blockIdx.x]; + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.y * blockDim.x + threadIdx.x) * vector_size; + int input_index = index * total_num_elements + tidx; + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + const uint32_t g_index = (input_index / group_size); + const uint32_t group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (input_index % group_size) * quantized_bits / 8; + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + T* store_base_ptr = q_val + tidx + blockIdx.x * total_num_elements; + float scale; + + uint8_t* scale_as_int8 = reinterpret_cast(&scale); + if (quantized_bits == 6) { + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + mem_access::load_global( + scale_as_int8 + quantization::quanitzed_access_granularity_6bits, + val + g_index * (group_size_bytes + 4) + group_size_bytes + + quantization::quanitzed_access_granularity_6bits); + } else + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { + mem_access::load_global( + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); + } + } + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); +#pragma unroll + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = 
(uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } + + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); + } + mem_access::store_global(store_base_ptr, store_buf); + } +} + +template +void launch_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int num_groups, + int group_size, + int num_indexes, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream) +{ + int total_elements_per_index = (num_groups / num_indexes) * group_size; + int blocks = (total_elements_per_index - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(num_indexes, blocks); + const dim3 block(quantization::threads); + DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { + apply_selective_dequantization + <<>>(val, q_val, indexes, group_size, total_elements_per_index); + }); +} +#define INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(T, mantisa) \ + template void launch_selective_dequantization( \ + uint8_t*, T*, int32_t*, int, int, int, int, int, cudaStream_t); +// fp8(E4M3) +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(__nv_bfloat16, 7); +#endif +INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(__half, 10); diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py index 5dc3c190ae5d..0d4bf7bc6db1 100644 --- a/deepspeed/ops/fp_quantizer/quantize.py +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -77,3 +77,38 @@ def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=Non fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1) return fp_out + + def selective_dequantize(self, + input_q, + indexes, + fp_out=None, + q_bits=8, + q_mantisa_bits=3, + scale=None) -> torch.Tensor: + assert (not hasattr(self, 'orig_shape') or len(self.orig_shape) == 3), \ + "Selective-Dequantization works on 3d tensor only! Please reshape the tensor before calling dequantize function." + assert (self.orig_dtype is not None), \ + "[De-quantization Error]: you need to call quantize before dequantizing!" + fp_out = torch.empty( + (indexes.shape[0], + *self.orig_shape[1:]), dtype=self.orig_dtype, device=input_q.device) if fp_out is None else fp_out + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-dequantization, please add the template arguments for the kernel to support this precision!" + + if scale is not None: + assert input_q.numel() == fp_out.numel(), \ + f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' 
+ input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous() + + fp_quant_module.selective_dequantize(fp_out, input_q, indexes, self.group_size, q_mantisa_bits, + q_bits - q_mantisa_bits - 1) + return fp_out diff --git a/tests/unit/ops/fp_quantizer/test_fp_quant.py b/tests/unit/ops/fp_quantizer/test_fp_quant.py index 101f4cd69811..bed8bd7e3bcc 100644 --- a/tests/unit/ops/fp_quantizer/test_fp_quant.py +++ b/tests/unit/ops/fp_quantizer/test_fp_quant.py @@ -61,6 +61,35 @@ def test_fp_quant_meta(dtype): assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +def test_fp_quant_selective(dtype): + group_size = 128 + q_bits = 8 + exp_bits = 4 + man_bits = 3 + + fpq = FP_Quantize(group_size=group_size) + indexes = torch.zeros(2, dtype=torch.int32, device='cuda') + indexes[0] = 1 + indexes[1] = 3 + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype, device='cuda') + + x = x.reshape(4, 1, x.shape[-1]) + ds_x = x.clone() + x_quantized = fpq.quantize(ds_x, q_bits=q_bits) + x_dequantized = fpq.selective_dequantize(x_quantized, indexes, q_bits=q_bits) + + qtorch_out = qtorch_quantize(x.index_select(0, indexes), + exp_bits=exp_bits, + man_bits=man_bits, + group_size=group_size) + qtorch_error = (qtorch_out - x.index_select(0, indexes)).abs().sum() / x.numel() + ds_error = (x_dequantized - x.index_select(0, indexes)).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" + + @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) @pytest.mark.parametrize("q_bits", [8, 6, 12], ids=["qbits8", "qbits6", "qbits12"]) def test_fp_quant(dtype, q_bits): From 99951caa3d2155a3bb84109a0828543793e088cc Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Fri, 19 Apr 2024 14:19:47 -0700 Subject: [PATCH 28/36] Fix sorting of shard optimizer states files for universal checkpoint (#5395) This PR resolves the issue reported in #5283. To resolve the issue, we sort files of sharded optimizer states based on DP indices. 
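To make the motivation concrete, here is a small standalone illustration (the unpadded `exp_avg.` shard suffixes are hypothetical and only stand in for the per-rank files): a plain string sort interleaves multi-digit DP ranks, while sorting on the parsed integer index keeps the shards in data-parallel order, which is what the fix relies on:

```python
import re

# Hypothetical shard file suffixes for 12 data-parallel ranks.
paths = [f"exp_avg.{dp}" for dp in range(12)]

# Lexical sorting puts rank 10 and 11 before rank 2 once ranks reach two digits.
print(sorted(paths)[:4])  # ['exp_avg.0', 'exp_avg.1', 'exp_avg.10', 'exp_avg.11']

# Sorting on the integer DP index parsed from the suffix restores rank order.
pattern = re.compile(r"exp_avg\.([0-9]+)$")
dp_sorted = sorted(paths, key=lambda p: int(pattern.search(p).group(1)))
print(dp_sorted[:4])  # ['exp_avg.0', 'exp_avg.1', 'exp_avg.2', 'exp_avg.3']
```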
--------- Co-authored-by: Olatunji Ruwase --- deepspeed/checkpoint/ds_to_universal.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index d5eca81c804f..63fa866718de 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -132,6 +132,10 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D): cnt = 0 +def dp_index_to_str(dp_index): + return f"{dp_index:0>2d}" + + def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel): global cnt # temp hack @@ -140,9 +144,8 @@ def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, os.makedirs(param_base_path, exist_ok=True) cnt += 1 - counter = f"{dp_index:0>2d}" - path = os.path.join(param_base_path, f"{state_name}.{counter}") + path = os.path.join(param_base_path, f"{state_name}.{dp_index_to_str(dp_index)}") #print(f"{param_name}: {offset}: {numel} => {path}") @@ -156,10 +159,21 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): slices = [] for tp_index in range(tp_degree): prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") - paths = sorted(list(glob.glob(f"{prefix_path}.*"))) + paths = glob.glob(f"{prefix_path}.*") + if len(paths) == 0: continue + pattern = re.compile(f"{prefix_path}\\.([0-9]+)") + dp_indices = set() + for p in paths: + m = pattern.match(p) + if m: + dp_indices.add(int(m.group(1))) + else: + raise ValueError(f"Cannot parse dp_rank from {p}") + + paths = [f"{prefix_path}.{dp_index_to_str(dp_index)}" for dp_index in sorted(list(dp_indices))] shards = [torch.load(p) for p in paths] if state == "step": From 3f875d95193fbd3a0c7f0c0dcc8d39469061bb66 Mon Sep 17 00:00:00 2001 From: shiyuan680 <72335504+shiyuan680@users.noreply.github.com> Date: Sun, 21 Apr 2024 07:35:50 +0800 Subject: [PATCH 29/36] add device config env for the accelerator (#5396) Thank you for [pr](https://github.com/microsoft/DeepSpeed/pull/5369) and @delock contribution of ideas. As mentioned in this [pr](https://github.com/microsoft/DeepSpeed/pull/5369), each device has its own environmental variables. We create visible_devices_envs() and set_visible_devices_envs() methods on the accelerator class to enable each accelerator to implement env settings within the interface , which is more generic to other accelerators. this commit has tested on npu, each one has 8 ascend npus --------- Co-authored-by: yangcheng Co-authored-by: eigen2017 Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- README.md | 11 ++++++----- accelerator/abstract_accelerator.py | 8 ++++++++ accelerator/cpu_accelerator.py | 8 ++++++++ accelerator/cuda_accelerator.py | 7 +++++++ accelerator/hpu_accelerator.py | 7 +++++++ accelerator/mps_accelerator.py | 9 +++++++++ accelerator/npu_accelerator.py | 7 +++++++ accelerator/xpu_accelerator.py | 7 +++++++ deepspeed/launcher/launch.py | 11 +++++++---- 9 files changed, 66 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a1335caa4949..201b9016f8ab 100755 --- a/README.md +++ b/README.md @@ -159,11 +159,12 @@ dynamically link them at runtime. ## Contributed HW support * DeepSpeed now support various HW accelerators. 
-| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | -| ----------- | -------- | ---------------- | --------------------- | ------------------ | -| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | -| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | +| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | +|-------------|-------------------------------------|------------------| --------------------- |--------------------| +| Huawei | Huawei Ascend NPU | npu | Yes | No | +| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | +| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 3c5d799e293e..768d5ea34e5e 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -287,3 +287,11 @@ def build_extension(self): @abc.abstractmethod def export_envs(self): ... + + @abc.abstractmethod + def visible_devices_envs(self): + ... + + @abc.abstractmethod + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index a0171723cfb8..237e7f51dcb4 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -322,3 +322,11 @@ def build_extension(self): def export_envs(self): return [] + + # TODO: cpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 3d5e9c168c16..2fc0cfd94125 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -360,3 +360,10 @@ def build_extension(self): def export_envs(self): return ['NCCL'] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 30b115e8b1ab..326efc8fa01b 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -294,3 +294,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['HABANA_VISIBLE_MODULES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 972b33caece1..ff70b860d7c7 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -258,3 +258,12 @@ def build_extension(self): def export_envs(self): return [] + + # TODO: mpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + # TODO: could not find visible devices env for mps + return 
['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 472157e32c02..5d891ecb707d 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -278,3 +278,10 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['ASCEND_RT_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 3f65263946ab..c59f60077d2f 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -289,3 +289,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['ZE_AFFINITY_MASK'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index ffb9cbc18fbd..079182a3b45b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -22,6 +22,7 @@ from collections import defaultdict from typing import Dict from argparse import ArgumentParser, REMAINDER +from deepspeed.accelerator import get_accelerator from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..nebula.constants import DLTS_POD_ENV_PATH from ..utils import logger, get_numactl_cmd @@ -146,8 +147,8 @@ def main(): node_list = list(world_info.keys()) args.nnodes = len(node_list) local_node = node_list[args.node_rank] - local_gpu_ids = world_info[local_node] - num_local_procs = len(local_gpu_ids) + local_accelerator_ids = world_info[local_node] + num_local_procs = len(local_accelerator_ids) logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) @@ -161,8 +162,10 @@ def main(): curr_global_rank += 1 logger.info(f"global_rank_mapping={global_rank_mapping}") logger.info(f"dist_world_size={dist_world_size}") - current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids)) - logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}") + + get_accelerator().set_visible_devices_envs(current_env, local_accelerator_ids) + for env in get_accelerator().visible_devices_envs(): + logger.info(f"Setting {env}={current_env[env]}") # set PyTorch distributed related environmental variables current_env["MASTER_ADDR"] = args.master_addr From 9b6ef9e1f0d8acaefd989440b27da9069aa69207 Mon Sep 17 00:00:00 2001 From: Wei Fu <36355462+garrett4wade@users.noreply.github.com> Date: Tue, 23 Apr 2024 03:47:00 +0800 Subject: [PATCH 30/36] 64bit indexing fused adam (#5187) ## The Issue Applying `FusedAdam` on large tensors will cause an error `CUDA error: an illegal memory access was encountered`. https://github.com/microsoft/DeepSpeed/issues/3429 https://github.com/NVIDIA/apex/issues/1654 ## PR Content Following the solution in the apex repository (https://github.com/NVIDIA/apex/pull/1765), changing indexing type to `int64` if necessary. 
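As a loose Python-level sketch of the dispatch idea (the actual check in this PR lives in the C++ extension; `needs_64bit_indexing` is an illustrative name, not an API added here): scan every tensor handed to the fused kernel and switch to 64-bit offsets as soon as any of them reaches `INT_MAX` elements:

```python
import torch

INT_MAX = 2**31 - 1

def needs_64bit_indexing(tensor_lists):
    # If any tensor has INT_MAX or more elements, 32-bit offsets can overflow,
    # so the kernel must be instantiated with 64-bit indexing instead.
    return any(t.numel() >= INT_MAX for group in tensor_lists for t in group)

# Illustrative only: tiny CPU tensors stand in for real parameter/gradient groups.
params = [torch.zeros(4), torch.zeros(8)]
grads = [torch.zeros(4), torch.zeros(8)]
print(needs_64bit_indexing([grads, params]))  # False for these small tensors
```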
--------- Co-authored-by: Michael Wyatt Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- csrc/adam/multi_tensor_adam.cu | 77 +++++++++++++++++++++++--------- csrc/adam/multi_tensor_apply.cuh | 10 ++--- 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu index d6b9b2f70710..a1fc7d15aec9 100644 --- a/csrc/adam/multi_tensor_adam.cu +++ b/csrc/adam/multi_tensor_adam.cu @@ -30,7 +30,7 @@ typedef enum : int { using MATH_T = float; -template +template struct AdamFunctor { __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, @@ -48,13 +48,13 @@ struct AdamFunctor { // if(*noop_gmem == 1) // return; - int tensor_loc = tl.block_to_tensor[blockIdx.x]; + index_t tensor_loc = tl.block_to_tensor[blockIdx.x]; // potentially use to pass in list of scalar // int tensor_num = tl.start_tensor_this_launch + tensor_loc; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; + index_t chunk_idx = tl.block_to_chunk[blockIdx.x]; + index_t n = tl.sizes[tensor_loc]; T* g = (T*)tl.addresses[0][tensor_loc]; g += chunk_idx * chunk_size; @@ -71,7 +71,8 @@ struct AdamFunctor { n -= chunk_idx * chunk_size; // see note in multi_tensor_scale_kernel.cu - for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { + for (index_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * ILP) { MATH_T r_g[ILP]; MATH_T r_p[ILP]; MATH_T r_m[ILP]; @@ -146,23 +147,57 @@ void multi_tensor_adam_cuda(int chunk_size, bias_correction2 = 1 - std::pow(beta2, step); } + size_t max_size = 0; + bool requires_64bit_indexing = false; + for (auto it = tensor_lists.begin(); it != tensor_lists.end(); it++) { + for (auto it2 = it->begin(); it2 != it->end(); it2++) { + if (it2->numel() > max_size) { + max_size = it2->numel(); + if (max_size >= INT_MAX) { + requires_64bit_indexing = true; + break; + } + } + } + if (requires_64bit_indexing) { break; } + } + // Assume single type across p,g,m1,m2 now - DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), - 0, - "adam", - multi_tensor_apply<4>(BLOCK_SIZE, - chunk_size, - noop_flag, - tensor_lists, - AdamFunctor(), - beta1, - beta2, - bias_correction1, - bias_correction2, - epsilon, - lr, - (adamMode_t)mode, - weight_decay);) + if (requires_64bit_indexing) { + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>((int64_t)BLOCK_SIZE, + (int64_t)chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + } else { + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>(BLOCK_SIZE, + chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + } AT_CUDA_CHECK(cudaGetLastError()); } diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh index 12f41cb49c6b..342376c141be 100644 --- a/csrc/adam/multi_tensor_apply.cuh +++ b/csrc/adam/multi_tensor_apply.cuh @@ -35,7 +35,7 @@ struct TensorListMetadata { }; template -__global__ void multi_tensor_apply_kernel(int chunk_size, +__global__ void multi_tensor_apply_kernel(int64_t chunk_size, volatile int* noop_flag, T tl, U callable, @@ -46,8 +46,8 @@ __global__ void multi_tensor_apply_kernel(int chunk_size, 
} template -void multi_tensor_apply(int block_size, - int chunk_size, +void multi_tensor_apply(int64_t block_size, + int64_t chunk_size, const at::Tensor& noop_flag, const std::vector>& tensor_lists, T callable, @@ -91,9 +91,9 @@ void multi_tensor_apply(int block_size, tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); loc_tensor_info++; - int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; + auto chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; - for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { + for (auto chunk = 0; chunk < chunks_this_tensor; chunk++) { // std::cout << chunks_this_tensor << std::endl; tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; tl.block_to_chunk[loc_block_info] = chunk; From c292b03a403cf0f6fd747f6106ad40060c70d3f8 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:50:15 -0700 Subject: [PATCH 31/36] Improve parallel process of universal checkpoint conversion (#5343) The conversion script from a regular checkpoint to the universal one runs the followings in parallel. 1. extracts zero sharded optimizer states 2. merge the shards However, it passes `map()` a set of only a few tasks (the number specified as workers). Thus it needs to wait for the slowest tasks to finish for every set. This PR submits all the tasks to the pool and wait until the futures get ready. We can keep all workers running. --------- Co-authored-by: Olatunji Ruwase --- deepspeed/checkpoint/ds_to_universal.py | 34 ++++++++----------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index 63fa866718de..b1a8276589b6 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -10,7 +10,7 @@ import argparse import glob import itertools -import multiprocessing +from concurrent.futures import ProcessPoolExecutor import os import re import shutil @@ -292,27 +292,18 @@ def get_matched_sub_params_pattern(name_): return unmatched_patterns -def _get_chunks(l, n): - for i in range(0, len(l), n): - yield l[i:i + n] - - def _do_parallel_work(do_work, work_chunks, num_workers): + results = [] if num_workers > 1: - pool = multiprocessing.Pool(num_workers) - results = [] - for batch in tqdm.tqdm(work_chunks): - res = pool.map(do_work, batch) - results.extend(res) - pool.close() - pool.join() + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_list = [executor.submit(do_work, work) for work in work_chunks] + for f in tqdm.tqdm(future_list): + results.append(f.result()) else: # No parallel pass for unit testing # We can't create child processes in tests - results = [] - for batch in tqdm.tqdm(work_chunks): - res = [do_work(x) for x in batch] - results.extend(res) + for work in tqdm.tqdm(work_chunks): + results.append(do_work(work)) return results @@ -321,20 +312,15 @@ def _extract_zero_shard_files(args, ds_checkpoint, temp_dir): itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) #pprint(f'{_3d_range_list=}') - work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers)) - #pprint(f'{work_chunks=}') - # extract_zero_shards(temp_dir, ds_checkpoint, _3d_range_list[0]) do_work = partial(extract_zero_shards, temp_dir, ds_checkpoint) - _do_parallel_work(do_work, work_chunks, args.num_extract_workers) + 
_do_parallel_work(do_work, _3d_range_list, args.num_extract_workers) def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): - work_chunks = list(_get_chunks(list(slice_shapes.items()), args.num_merge_workers)) - #pprint(work_chunks) zero_output_folder = os.path.join(args.output_folder, "zero") do_work = partial(merge_tp_slices, ds_checkpoint, zero_output_folder, temp_dir, ds_checkpoint.tp_degree) - unmatched_patterns_lists = _do_parallel_work(do_work, work_chunks, args.num_merge_workers) + unmatched_patterns_lists = _do_parallel_work(do_work, list(slice_shapes.items()), args.num_merge_workers) # verify that all patterns were used # if a pattern was not used by any of the workers, then it was not used at all -> assert/alert From c66bc4269e9484b6e57d6f5521df02c70d399246 Mon Sep 17 00:00:00 2001 From: inkcherry Date: Tue, 23 Apr 2024 07:27:09 +0800 Subject: [PATCH 32/36] set the default to use set_to_none for clearing gradients in BF16 optimizer. (#5434) as discussed in #5175, set the default to use set_to_none for clearing gradients in BF16 optimizer. Additionally, for the case of zero clearing, use foreach_zero. Verified correctness with mega-ds llama 7B training. FYI @loadams --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/bf16_optimizer.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index f970e582b354..1f3365b20f4e 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -341,7 +341,7 @@ def _update_hp_grad(self, lp, group_idx, param_idx, clear_lp_grads): # clear gradients if clear_lp_grads: - lp.grad._zero() + lp.grad.zero_() @torch.no_grad() def _update_hp_grads_func(self, clear_lp_grads=False): @@ -441,11 +441,20 @@ def clear_hp_grads(self): self.fp32_groups_has_gradients[i] = [False] * len(group) def clear_lp_grads(self): + + # using zero_() fixed memory address for graph replay + set_to_none = False if self.graph_harvesting else True + zero_grads_list = [] for group in self.bf16_groups: for param in group: - if param.grad is not None: - # Using zero_() fixed memory address for graph replay - param.grad.zero_() + if set_to_none: + param.grad = None + elif param.grad is not None: + if param.grad.grad_fn is not None: + param.grad.detach_() + zero_grads_list.append(param.grad) + if not set_to_none and len(zero_grads_list) > 0: + torch._foreach_zero_(zero_grads_list) def state_dict(self): state_dict = {} From 5e6c9b931184bf8f0a245a2fef183078139b6d7e Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 23 Apr 2024 12:24:37 -0700 Subject: [PATCH 33/36] OptimizedLinear implementation (#5355) Optimized version of `nn.Linear` that adds features such as: * LoRA w. 
base weight sharding * FP [6,8,12] quantization Depends on #5336 being merged first Co-authored-by: @rajhans Co-authored-by: @aurickq --------- Co-authored-by: Rajhans Samdani Co-authored-by: Jeff Rasley --- deepspeed/linear/__init__.py | 7 ++ deepspeed/linear/config.py | 39 +++++++ deepspeed/linear/optimized_linear.py | 150 +++++++++++++++++++++++++ deepspeed/linear/quantization.py | 137 ++++++++++++++++++++++ deepspeed/ops/fp_quantizer/__init__.py | 2 +- deepspeed/ops/fp_quantizer/quantize.py | 33 +++++- tests/unit/linear/test_linear.py | 128 +++++++++++++++++++++ tests/unit/linear/test_quant_param.py | 58 ++++++++++ 8 files changed, 550 insertions(+), 4 deletions(-) create mode 100644 deepspeed/linear/__init__.py create mode 100644 deepspeed/linear/config.py create mode 100644 deepspeed/linear/optimized_linear.py create mode 100644 deepspeed/linear/quantization.py create mode 100644 tests/unit/linear/test_linear.py create mode 100644 tests/unit/linear/test_quant_param.py diff --git a/deepspeed/linear/__init__.py b/deepspeed/linear/__init__.py new file mode 100644 index 000000000000..a27f1c3eaee7 --- /dev/null +++ b/deepspeed/linear/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .optimized_linear import OptimizedLinear +from .config import LoRAConfig, QuantizationConfig diff --git a/deepspeed/linear/config.py b/deepspeed/linear/config.py new file mode 100644 index 000000000000..ae9050a3c92b --- /dev/null +++ b/deepspeed/linear/config.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from dataclasses import dataclass + + +@dataclass +class LoRAConfig: + """ + Configuration settings for LoRAOptimizedLinear. + + Attributes: + lora_r (int): LoRA attention dimension, also know as the rank. Defaults is 64. + lora_alpha (float): LoRA scaling factor, default is 16. + base_weight_sharding (int): The degree to which the base weights are sharded, + should typically be set to the data-parallel world size to maximize the memory + reduction benefits. Defaults to 1, which means this feature is disabled. + """ + lora_r: int = 64 + lora_alpha: float = 16. + base_weight_sharding: int = 1 + + +@dataclass +class QuantizationConfig: + """ + Configuration settings for quantization for LoRAOptimizedLinear, QuantizedLinear, + and QuantizedParameter + + Attributes: + q_bits (int): The number of bits used for quantization. Default is 8. + mantissa_bits (int): The number of bits reserved for the mantissa in fixed-point quantization. Default is 3. + group_size (int): The size of the group used for quantization. Default is 512. + """ + q_bits: int = 8 + mantissa_bits: int = 3 + group_size: int = 512 diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py new file mode 100644 index 000000000000..138bd493ffc7 --- /dev/null +++ b/deepspeed/linear/optimized_linear.py @@ -0,0 +1,150 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import math +import torch.nn as nn +import torch.nn.functional as F +from dataclasses import is_dataclass +from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist + +from .config import LoRAConfig, QuantizationConfig +from .quantization import QuantizedParameter, QuantizedLinear + + +class OptimizedLinear(nn.Module): + """ + Optimized version of nn.Linear that adds features such as: + * LoRA w. 
base weight sharding + * FP [6,8,12] quantization + + Arguments: + input_dim: Required: size of each input sample + output_dim: Required: size of each output sample + bias: Optional: If set to False, the layer will not learn an additive bias. Default: False + lora_config: Optional: LoRAConfig defining lora features and base-weight-sharding degree + quantization_config: Optional: QuantizationConfig defining quantization features + dtype: Optional: parameter dtype, only supports bfloat16 currently + + Returns: + Returns a new nn.Module depending on the input config. Either native + torch.nn.Linear, QuantizedLinear, or the full-featured DSOptimizedLinear. + """ + + def __new__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + + if quantization_config is not None and not is_dataclass(quantization_config): + raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}") + if lora_config is not None and not is_dataclass(lora_config): + raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}") + if lora_config is None and quantization_config is None: + # Everything disabled, fall back to normal nn.Linear + self = nn.Linear(input_dim, output_dim, bias=bias, dtype=dtype) + + elif lora_config: + # lora enabled, quantization may or may not be + self = LoRAOptimizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=dtype) + + elif quantization_config: + # only quantization enabled, no lora + self = QuantizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + quantization_config=quantization_config, + dtype=dtype) + return self + + +class LoRAOptimizedLinear(nn.Module): + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + device=None, + dtype=torch.bfloat16): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.bias = bias + self.lora_config = lora_config + self.quantization_config = quantization_config + device = get_accelerator().current_device() if device is None else device + assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config" + + self.zero_shards = self.lora_config.base_weight_sharding + self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards) + w = torch.nn.Parameter(torch.empty((self.output_dim, self.sharded_weight_size), dtype=dtype)) + torch.nn.init.xavier_uniform_(w) + + if self.quantization_config is not None: + assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization" + self.base_weight = QuantizedParameter(w, quantization_config=quantization_config) + else: + self.base_weight = w + + self.base_weight.requires_grad = False + + # Use RS lora for now. + self.lora_scaling_factor = self.lora_config.lora_alpha / math.sqrt(self.lora_config.lora_r) + # Keeping lora weights in bf16 precision for ease of training. 
+ self.lora_weight_1 = nn.Linear(self.input_dim, + self.lora_config.lora_r, + bias=self.bias, + device=device, + dtype=dtype) + self.lora_weight_2 = nn.Linear(self.lora_config.lora_r, + self.output_dim, + bias=self.bias, + device=device, + dtype=dtype) + self.lora_weight_1.weight.requires_grad = True + self.lora_weight_2.weight.requires_grad = True + + def full_weight(self): + # This assumes weights are evenly sharded across gpus. which might not be correct. + # in that case, we should flatten before all_gather. + local_weight = self.base_weight.dequantized() if isinstance(self.base_weight, + QuantizedParameter) else self.base_weight + tensor_list = [ + torch.zeros_like(local_weight, device=local_weight.device, dtype=local_weight.dtype) + for _ in range(self.zero_shards) + ] + dist.all_gather(tensor_list, local_weight) + weight = nn.Parameter(torch.cat([tensor for tensor in tensor_list], dim=1)) + return weight + + def linear_without_F_linear(self, input, weight): + output = torch.mm(input.reshape(-1, input.shape[-1]), weight) + output = output.view(*input.shape[:-1], weight.shape[1]) + return output + + def forward(self, input_tensor): + # Gather the sharded base weight + if self.zero_shards > 1: + with torch.no_grad(): + base_weight = self.full_weight() + elif self.quantization_config: + base_weight = self.base_weight.dequantized() + else: + base_weight = self.base_weight + + base_weight_output = F.linear(input_tensor, base_weight) + lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor)) + return base_weight_output + self.lora_scaling_factor * lora_output diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py new file mode 100644 index 000000000000..f5343af45fb8 --- /dev/null +++ b/deepspeed/linear/quantization.py @@ -0,0 +1,137 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional + +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.fp_quantizer import Quantizer, FP_Quantize +from .config import QuantizationConfig + + +class QuantizedParameter(nn.Parameter): + """ + Quantized parameter class that implements weight quantization. Weights + are stored in quantized form on GPUs, and can be dequantized on-the-fly when + needed by the model. The weights are actually quantized during any `.to(device)`. + + Arguments: + data (Tensor): parameter tensor. + requires_grad (bool, optional): if the parameter requires gradient. Defaults + to False and is not supported to be True. Argument provided only for interface + compatibility with torch.nn.Parameter. + quantization_config (QuantizationConfig, optional): + quantizer (Quantizer, optional): Defaults to FP_Quantize but can be any quantizer + that implements deepspeed.ops.fp_quantizer.Quantizer. This argument is also + required since the quantizer is stashed in the Parameter itself, some models + may clone the Parameter by passing an attribute __dict__. 
For an example, see + tests/unit/linear/test_quant_param.py::TestQuantParam::test_hf_clone + """ + + def __new__( + cls, + data: Optional[torch.Tensor] = None, + requires_grad: bool = False, # quantized weights must be frozen + quantization_config: QuantizationConfig = None, + quantizer: Quantizer = None, + ): + if requires_grad: + raise ValueError(f"requires_grad=True is not supported with QuantizedParameter") + if data is None: + data = torch.empty(0) + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.quantization_config = QuantizationConfig() if quantization_config is None else quantization_config + if quantizer is not None: + self.quantizer = quantizer + else: + # if FPQuantizerBuilder is not compatible in this env this init will fail + self.quantizer = FP_Quantize(group_size=self.quantization_config.group_size) + self._ensure_quantized(self) + return self + + def _ensure_quantized(self, tensor: torch.Tensor): + # If the tensor is on the accelerator and is not quantized, then quantize it in-place. + if get_accelerator().on_accelerator(tensor) and tensor.dtype != torch.int8: + with get_accelerator().stream(get_accelerator().current_stream(tensor.device)): + tensor.data = self.quantizer.quantize(tensor.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + assert tensor.dtype == torch.int8 + + def dequantized(self) -> torch.Tensor: + """ + Return a tensor containing the dequantized weights of this parameter. + """ + if get_accelerator().on_accelerator(self.data) and self.data.dtype == torch.int8: + with get_accelerator().stream(get_accelerator().current_stream(self.data.device)): + return self.quantizer.dequantize(self.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + return self.data + + def __getstate__(self): + state = self.__dict__ + state["data"] = self.data + state["quantization_config"] = self.quantization_config + state["requires_grad"] = self.requires_grad + return state + + def __setstate__(self, state): + self.quantizer = state["quantizer"] + self.quantization_config = state["quantization_config"] + self.data = state["data"] + self.requires_grad = state["requires_grad"] + + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + new_instance.quantizer = copy.deepcopy(state["quantizer"]) + new_instance.quantization_config = copy.deepcopy(state["quantization_config"]) + new_instance.data = copy.deepcopy(state["data"]) + return new_instance + + def __copy__(self): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def cuda(self, device=None, non_blocking=False): + return self.to(device="cuda" if device is None else device, non_blocking=non_blocking) + + def to(self, *args, **kwargs): + """ + Move the parameter to the given device. Then, if the device is a cuda device, + quantize it. + """ + tensor = super().to(*args, **kwargs) + self._ensure_quantized(tensor) + return tensor + + +class QuantizedLinear(nn.Linear): + """ + Linear layer that implements weight quantization. Parameters + are stored via `QuantizedParameter` and are dequantized on-the-fly during any + forward pass. 
+ """ + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + super().__init__(input_dim, output_dim, bias=bias, dtype=dtype) + assert dtype == torch.bfloat16, "currently only supports bfloat16 dtype" + self.weight = QuantizedParameter(self.weight.data, quantization_config=quantization_config) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight.dequantized(), self.bias) diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py index 5575f3567185..995bbae4aeaf 100644 --- a/deepspeed/ops/fp_quantizer/__init__.py +++ b/deepspeed/ops/fp_quantizer/__init__.py @@ -3,4 +3,4 @@ # DeepSpeed Team -from .quantize import FP_Quantize +from .quantize import FP_Quantize, Quantizer diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py index 0d4bf7bc6db1..f8435bda16c1 100644 --- a/deepspeed/ops/fp_quantizer/quantize.py +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -4,20 +4,47 @@ # DeepSpeed Team import torch +import abc +from abc import ABC from deepspeed.ops.op_builder import FPQuantizerBuilder fp_quant_module = None -class FP_Quantize: +class Quantizer(ABC): + """ + Abstract Quantizer class that implmenents quantize/dequantize methods. + + Arguments: + group_size (int, optional): number of values or elements that are grouped + together for the quantization process. + """ + + def __init__(self, group_size=512) -> None: + self.group_size = group_size + + @abc.abstractmethod + def quantize(self, + input, + q_bits=8, + q_mantisa_bits=3, + stochastic_mode=False, + return_meta_tensor=False) -> torch.Tensor: + ... + + @abc.abstractmethod + def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor: + ... + + +class FP_Quantize(Quantizer): def __init__(self, group_size=512) -> None: global fp_quant_module + super().__init__(group_size=group_size) if fp_quant_module is None: fp_quant_module = FPQuantizerBuilder().load() - - self.group_size = group_size self.orig_dtype = None def quantize(self, diff --git a/tests/unit/linear/test_linear.py b/tests/unit/linear/test_linear.py new file mode 100644 index 000000000000..ccd26b4cd726 --- /dev/null +++ b/tests/unit/linear/test_linear.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed +import deepspeed.comm as dist + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig +from unit.common import DistributedTest + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestBasicLinear(DistributedTest): + world_size = 2 + + def test(self): + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 1 # Number of samples in a batch + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + + dummy_input = torch.rand(batch_size, input_features, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2]) +class TestLoRALinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding): + rank = dist.get_rank() + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + if rank == 0: + for n, p in linear_layer.named_parameters(): + print(f"{n}, {p.shape}") + + dummy_input = torch.rand(batch_size, input_features, device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("q_bits", [8, 6]) +class TestQuantLinear(DistributedTest): + world_size = 2 + + def test(self, q_bits): + rank = dist.get_rank() + lora_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = None + quantization_config = QuantizationConfig(q_bits=q_bits) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2], ids=['bws1', 'bws2']) +@pytest.mark.parametrize("q_bits", [8, 6], ids=['qbit8', 'qbit6']) +class TestOptimizedLinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding, q_bits): + rank = dist.get_rank() + lora_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + 
lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + quantization_config = QuantizationConfig(q_bits=q_bits) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) diff --git a/tests/unit/linear/test_quant_param.py b/tests/unit/linear/test_quant_param.py new file mode 100644 index 000000000000..9479b3cba8a0 --- /dev/null +++ b/tests/unit/linear/test_quant_param.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear.quantization import QuantizedParameter +from deepspeed.linear.config import QuantizationConfig + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +from unit.common import DistributedTest + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestQuantParam(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize('dtype', [torch.half, torch.float]) + def test_unsupported_dtypes(self, dtype): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device='cpu', dtype=dtype) + qp = QuantizedParameter(data) + with pytest.raises(AssertionError): + qp.to(device) + + def test_requires_grad(self): + data = torch.rand(5, 5, dtype=torch.bfloat16) + with pytest.raises(ValueError): + QuantizedParameter(data, requires_grad=True) + + def test_move_to_accelerator(self): + device = get_accelerator().current_device() + data = torch.rand(5, 5, device='cpu', dtype=torch.bfloat16) + qp = QuantizedParameter(data) + assert qp.device == torch.device('cpu') + qp = qp.to(get_accelerator().current_device_name()) + assert qp.device == torch.device(device) + assert qp.dtype == torch.int8 + + def test_hf_clone(self): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device=device, dtype=torch.bfloat16) + + quantization_config = QuantizationConfig(q_bits=6) + qp = QuantizedParameter(data, quantization_config=quantization_config) + + # should be able to clone parameter via dict, HF expects this to work + qp_copy = QuantizedParameter(qp.data, **qp.__dict__) + + assert all(qp.data == qp_copy.data) + assert qp.quantization_config == qp_copy.quantization_config From ad2027952f9730cbd1a8385e4a441e470248645e Mon Sep 17 00:00:00 2001 From: Jhonso7393 <167781426+Jhonso7393@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:45:47 +0300 Subject: [PATCH 34/36] Update README.md (#5453) Fixing a minor typo at the README file Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 201b9016f8ab..f9d81eddfdae 100755 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn 
how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). * [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) +* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) * [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html) * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] From 5f631abc2f930ecece38fae05dc9bd3923c555dd Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:24:12 -0700 Subject: [PATCH 35/36] Update PyTest torch version to match PyTorch latest official (2.3.0) (#5454) --- .github/workflows/cpu-torch-latest.yml | 4 ++-- .github/workflows/nv-torch-latest-v100.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 9c1ad02f75a6..5727ff2e1cde 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -50,5 +50,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.2" - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.2" + TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.3" + TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.3" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 3ca8ac43dfa4..2e0490c18ba7 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118 python -c "import 
torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From fbdf0eaf15f45cd2a8b5846ad9428609a3860b41 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Apr 2024 16:27:27 -0700 Subject: [PATCH 36/36] Update version.txt after 0.14.2 release (#5458) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.14.2 Author - @loadams Co-authored-by: loadams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index e867cc2a66a8..ac4a79626c87 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.2 +0.14.3
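
For reference, the OptimizedLinear API introduced in [PATCH 33/36] can be exercised end-to-end with a short script such as the minimal sketch below. It is a single-process illustration (base_weight_sharding=1) and assumes the FP quantizer op is compatible on the system, as the unit tests in that patch require; treat it as an illustration of the API shapes shown in the diffs above rather than shipped documentation.

import torch
from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig
from deepspeed.accelerator import get_accelerator

# LoRA rank 16, no base-weight sharding (single process), 8-bit FP weight quantization.
lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=1)
quantization_config = QuantizationConfig(q_bits=8)

# With neither config supplied, OptimizedLinear falls back to a plain
# torch.nn.Linear, so the same call site works with or without these features.
layer = OptimizedLinear(input_dim=64,
                        output_dim=64,
                        lora_config=lora_config,
                        quantization_config=quantization_config,
                        dtype=torch.bfloat16)

device = get_accelerator().current_device_name()
layer = layer.to(device)  # moving to the accelerator quantizes the frozen base weight

x = torch.rand(5, 64, device=device, dtype=torch.bfloat16)
y = layer(x)
assert y.shape == (5, 64)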