diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index bb2b002b1a174..0de6832b37c13 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -50,5 +50,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4" - HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4" + HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.5" + HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.5" diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml index 794c02b3c4dbf..eb3ac9b03161a 100644 --- a/.github/workflows/no-torch.yml +++ b/.github/workflows/no-torch.yml @@ -5,6 +5,7 @@ on: pull_request: paths: - '.github/workflows/no-torch.yml' + - 'op_builder/**' schedule: - cron: "0 0 * * *" diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 915493bb3183c..346055e2685f4 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index c823c194e83cb..2ad336cac4edc 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -26,7 +26,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -36,7 +36,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index f863226bfb954..c0a4275bd2b6b 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -22,7 +22,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -32,7 +32,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -58,8 +58,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - #pytest $PYTEST_OPTS -m 'seq_inference' 
unit/ --torch_ver="2.1" --cuda_ver="11.8" - pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" + #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.1" + pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.1" # run ds_report again to check updated op list ds_report diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 6af228f7fb2f8..044c282ba1198 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -19,9 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] - - env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v3 @@ -31,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index d394b7e24bd60..a576e5933b08d 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -27,7 +27,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 8658ff5d2348a..8c56d0445c9c7 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -18,7 +18,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -28,7 +28,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -58,7 +58,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.5" --cuda_ver="12.1" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index b348d5ff931f5..0344c80451a68 100644 --- 
a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -53,6 +53,8 @@ jobs: pip install image-similarity-measures python -m pip install opencv-python==4.6.* --force-reinstall python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja + # Update packages included in the container that do not support pydantic 2+ to versions that do + python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,sd] ds_report - name: Python environment diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index ebef2d35c2780..0b8f504d8b5a7 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -55,5 +55,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1" diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index 2570404390053..74495812add5f 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -15,7 +15,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -25,7 +25,7 @@ jobs: - name: Install pytorch run: | - pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 + pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index cfed6d6583e6c..18c5e2c98bc6a 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -18,7 +18,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu121 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/xpu-compile.yml 
b/.github/workflows/xpu-compile.yml new file mode 100644 index 0000000000000..e095e089fc30f --- /dev/null +++ b/.github/workflows/xpu-compile.yml @@ -0,0 +1,65 @@ +name: xpu-compile + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-compile.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + compile-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v4 + - name: Install prerequisite + run: | + apt-get update + apt-get install clinfo libaio-dev python3-pip -y + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl + pip install py-cpuinfo numpy + pip install .[dev,autotuning] + + - name: Check container state + run: | + ldd --version + ds_report + python3 -c "import torch; print('torch:', torch.__version__, torch)" + python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + pip list + + - name: Compile Status + shell: bash + run: | + echo "# torch.compile graph breaks" >> $GITHUB_STEP_SUMMARY + export FI_HMEM=system + ulimit -n 1048575 + cd tests/torch_compile + export ZE_AFFINITY_MASK=0,1 + echo "## ZeRO stage 3" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z3.json 2>&1 | tee log_z3.txt + # for each line start with 'dynamo_output', extract the second field and following fields and append to GITHUB_STEP_SUMMARY using awk + cat log_z3.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY + echo "## ZeRO stage 2" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z2.json 2>&1 | tee log_z2.txt + cat log_z2.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY diff --git a/CODEOWNERS b/CODEOWNERS index 2c16aef39a1be..c0fc85cb8b898 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -7,50 +7,50 @@ # top-level repo folders -/.github/ @mrwyattii @loadams -/azure/ @mrwyattii @awan-10 -/benchmarks/ @awan-10 @mrwyattii -/bin/ @mrwyattii -/csrc/ @awan-10 @mrwyattii @cmikeh2 @arashb -/deepspeed/ @mrwyattii -/docker/ @mrwyattii @awan-10 -/docs/ @mrwyattii -/examples/ @awan-10 @mrwyattii -/op_builder/ @mrwyattii @cmikeh2 -/release/ @loadams @mrwyattii -/requirements/ @loadams @mrwyattii -/scripts/ @mrwyattii @awan-10 -/tests/ @mrwyattii @tjruwase @loadams +/.github/ @loadams +/azure/ @awan-10 +/benchmarks/ @awan-10 @tjruwase +/bin/ @loadams +/csrc/ @awan-10 +/deepspeed/ @loadams @tjruwase +/docker/ @awan-10 +/docs/ @loadams @tjruwase +/examples/ @awan-10 @tohtana +/op_builder/ @loadams 
@tjruwase @jomayeri +/release/ @loadams +/requirements/ @loadams +/scripts/ @awan-10 +/tests/ @tjruwase @loadams @tohtana # deepspeed -/deepspeed/autotuning/ @mrwyattii +/deepspeed/autotuning/ @loadams /deepspeed/checkpoint/ @tjruwase /deepspeed/comm/ @awan-10 -/deepspeed/compression/ @minjiaz @xiaoxiawu-microsoft @conglongli -/deepspeed/elasticity/ @mrwyattii @awan-10 -/deepspeed/launcher/ @mrwyattii @awan-10 -/deepspeed/module_inject/ @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/moe/ @awan-10 -/deepspeed/monitor/ @awan-10 @mrwyattii -/deepspeed/nebula/ @tjruwase @mrwyattii -/deepspeed/ops/ @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/pipe/ @ShadenSmith @duli2012 -/deepspeed/profiling/ @ShijieZZZZ -/deepspeed/utils/ @mrwyattii @tjruwase @awan-10 +/deepspeed/compression/ @tjruwase +/deepspeed/elasticity/ @awan-10 +/deepspeed/launcher/ @loadams +/deepspeed/module_inject/ @awan-10 +/deepspeed/moe/ @tohtana +/deepspeed/monitor/ @awan-10 +/deepspeed/nebula/ @tjruwase +/deepspeed/ops/ @tohtana +/deepspeed/pipe/ @tohtana @loadams +/deepspeed/profiling/ @loadams +/deepspeed/utils/ @tjruwase @awan-10 # inference -/deepspeed/inference/ @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/model_implementations/ @mrwyattii @awan-10 @cmikeh2 @arashb +/deepspeed/inference/ @awan-10 +/deepspeed/model_implementations/ @awan-10 # training -/deepspeed/runtime/ @mrwyattii @tjruwase -/deepspeed/runtime/activation_checkpointing/ @mrwyattii @tjruwase -/deepspeed/runtime/checkpoint_engine/ @tjruwase @mrwyattii +/deepspeed/runtime/ @tjruwase @tohtana +/deepspeed/runtime/activation_checkpointing/ @tjruwase +/deepspeed/runtime/checkpoint_engine/ @tjruwase /deepspeed/runtime/comm/ @awan-10 -/deepspeed/runtime/compression/ @awan-10 @conglongli -/deepspeed/runtime/data_pipeline/ @conglongli -/deepspeed/runtime/fp16/ @mrwyattii @tjruwase -/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10 -/deepspeed/runtime/pipe/ @ShadenSmith @duli2012 -/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii -/deepspeed/runtime/zero/ @tjruwase @mrwyattii +/deepspeed/runtime/compression/ @awan-10 +/deepspeed/runtime/data_pipeline/ @tjruwase +/deepspeed/runtime/fp16/ @tjruwase +/deepspeed/runtime/fp16/onebit/ @awan-10 +/deepspeed/runtime/pipe/ @loadams +/deepspeed/runtime/swap_tensor/ @tjruwase +/deepspeed/runtime/zero/ @tjruwase diff --git a/README.md b/README.md index 2f6661ef58603..b302e32dfd9ca 100755 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) [![Downloads](https://static.pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed) [![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status) +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9530/badge)](https://www.bestpractices.dev/projects/9530) [![Twitter](https://img.shields.io/twitter/follow/MSFTDeepSpeed)](https://twitter.com/intent/follow?screen_name=MSFTDeepSpeed) [![Japanese Twitter](https://img.shields.io/badge/%E6%97%A5%E6%9C%AC%E8%AA%9ETwitter-%40MSFTDeepSpeedJP-blue)](https://twitter.com/MSFTDeepSpeedJP) [![Chinese Zhihu](https://img.shields.io/badge/%E7%9F%A5%E4%B9%8E-%E5%BE%AE%E8%BD%AFDeepSpeed-blue)](https://www.zhihu.com/people/deepspeed) @@ -141,6 +142,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | PyTorch Nightly | 
[![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | +| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml/badge.svg?branch=master)](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml) | # Installation diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index d4fcbb0b1e3eb..1e4335b192928 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -301,9 +301,9 @@ def get_op_builder(self, class_name): # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed # if successful this also means we're doing a local install and not JIT compile path from op_builder import __deepspeed__ # noqa: F401 # type: ignore - from op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder except ImportError: - from deepspeed.ops.op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from deepspeed.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder if class_name == "CCLCommBuilder": return CCLCommBuilder @@ -313,6 +313,8 @@ def get_op_builder(self, class_name): return FusedAdamBuilder elif class_name == "CPUAdamBuilder": return CPUAdamBuilder + elif class_name == "AsyncIOBuilder": + return AsyncIOBuilder else: # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests return NotImplementedBuilder diff --git a/accelerator/mlu_accelerator.py b/accelerator/mlu_accelerator.py new file mode 100644 index 0000000000000..bef716f0ee4e4 --- /dev/null +++ b/accelerator/mlu_accelerator.py @@ -0,0 +1,300 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import importlib +import inspect +import functools + +from .abstract_accelerator import DeepSpeedAccelerator +import torch +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. + + +class MLU_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'mlu' + self._communication_backend_name = 'cncl' + self._compile_backend = "inductor" + self.class_dict = None + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index == None: + return 'mlu' + return 'mlu:{}'.format(device_index) + + def device(self, device_index=None): + return torch.mlu.device(device_index) + + def set_device(self, device_index): + torch.mlu.set_device(device_index) + + def current_device(self): + return torch.mlu.current_device() + + def current_device_name(self): + return 'mlu:{}'.format(torch.mlu.current_device()) + + def device_count(self): + return torch.mlu.device_count() + + def synchronize(self, device_index=None): + return torch.mlu.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.mlu.set_rng_state(new_state) + + return torch.mlu.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.mlu.get_rng_state() + + return torch.mlu.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.mlu.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.mlu.manual_seed_all(seed) + + def initial_seed(self, seed): + return torch.mlu.initial_seed(seed) + + def default_generator(self, device_index): + return torch.mlu.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + 
return torch.mlu.Stream + + def stream(self, stream): + return torch.mlu.stream(stream) + + def current_stream(self, device_index=None): + return torch.mlu.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.mlu.default_stream(device_index) + + @property + def Event(self): + return torch.mlu.Event + + # Memory management + def empty_cache(self): + return torch.mlu.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.mlu.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.mlu.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.mlu.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.mlu.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.mlu.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return torch.mlu.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.mlu, 'memory_stats'): + return torch.mlu.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.mlu, 'reset_peak_memory_stats'): + return torch.mlu.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.mlu, 'memory_reserved'): + return torch.mlu.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.mlu, 'max_memory_reserved'): + return torch.mlu.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.mlu.get_device_properties(device_index).total_memory + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return torch.mlu.is_bf16_supported() + + def is_fp16_supported(self): + return True + + def supported_dtypes(self): + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes + + # Misc + def amp(self): + if hasattr(torch.mlu, 'amp'): + return torch.mlu.amp + return None + + def is_available(self): + return torch.mlu.is_available() + + def range_push(self, msg): + if hasattr(torch.mlu.cnpx, 'range_push'): + return torch.mlu.cnpx.range_push(msg) + + def range_pop(self): + if hasattr(torch.mlu.cnpx, 'range_pop'): + return torch.mlu.cnpx.range_pop() + + def lazy_call(self, callback): + return torch.mlu._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return True + + # Graph operations + def create_graph(self): + torch.mlu.MLUGraph() + + def capture_to_graph(self, graph, pool=None, stream=None): + return torch.mlu.graph(graph, pool, stream) + + def replay_graph(self, graph): + graph.replay() + return + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='mlu') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device='mlu') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='mlu') + + @property + def FloatTensor(self): + return 
functools.partial(torch.tensor, dtype=torch.float, device='mlu') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='mlu') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='mlu') + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device='mlu') + + def pin_memory(self, tensor): + return tensor.pin_memory() + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('mlu:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.mlu" + except ImportError: + return "deepspeed.ops.op_builder.mlu" + + def _lazy_init_class_dict(self): + if self.class_dict: + return + + op_builder_module = importlib.import_module(self.op_builder_dir()) + + # get op builder class from op_builder/mlu/__init__.py + self.class_dict = {} + for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass): + self.class_dict[class_name] = class_obj + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + builder_class = self.get_op_builder(class_name) + return builder_class() + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return ['NEUWARE_HOME', 'CNCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['MLU_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends }") diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 037162e867ecc..69e96d285bb8a 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -20,7 +20,7 @@ except ImportError as e: dsa2 = None -SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu'] +SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu', 'mlu'] ds_accelerator = None @@ -94,6 +94,11 @@ def get_accelerator(): except ImportError as e: raise ValueError( f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + elif accelerator_name == "mlu": + try: + import torch_mlu # noqa: F401 + except ImportError as e: + raise ValueError(f"MLU_Accelerator requires torch_mlu, which is not installed on this system.") elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. ' f'Value "{accelerator_name}" is not supported') @@ -149,6 +154,13 @@ def get_accelerator(): accelerator_name = "hpu" except ImportError as e: pass + if accelerator_name is None: + try: + import torch_mlu # noqa: F401,F811 + + accelerator_name = "mlu" + except ImportError as e: + pass if accelerator_name is None: # borrow this log from PR#5084 try: @@ -198,6 +210,10 @@ def get_accelerator(): from .hpu_accelerator import HPU_Accelerator ds_accelerator = HPU_Accelerator() + elif accelerator_name == 'mlu': + from .mlu_accelerator import MLU_Accelerator + + ds_accelerator = MLU_Accelerator() _validate_accelerator(ds_accelerator) if accel_logger is not None: accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})") diff --git a/bin/ds_io b/bin/ds_io new file mode 100644 index 0000000000000..681fd634764c4 --- /dev/null +++ b/bin/ds_io @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from deepspeed.nvme import ds_io_main + +if __name__ == '__main__': + ds_io_main() diff --git a/bin/ds_nvme_tune b/bin/ds_nvme_tune new file mode 100644 index 0000000000000..117adfba22c08 --- /dev/null +++ b/bin/ds_nvme_tune @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +from deepspeed.nvme import sweep_main, generate_main, parse_sweep_arguments + +if __name__ == '__main__': + args = parse_sweep_arguments() + print(f"Running DeepNVMe performance tuning on {args.nvme_dir}") + sweep_main(args) + generate_main(args.log_dir) diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index a65cc500cc821..81c315e9a5583 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -68,8 +68,8 @@ static void _get_aio_latencies(std::vector>& raw_l std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); } -static void _do_io_submit_singles(const long long int n_iocbs, - const long long int iocb_index, +static void _do_io_submit_singles(const int64_t n_iocbs, + const int64_t iocb_index, std::unique_ptr& aio_ctxt, std::vector>& submit_times) { @@ -89,8 +89,8 @@ static void _do_io_submit_singles(const long long int n_iocbs, } } -static void _do_io_submit_block(const long long int n_iocbs, - const long long int iocb_index, +static void _do_io_submit_block(const int64_t n_iocbs, + const int64_t iocb_index, std::unique_ptr& aio_ctxt, std::vector>& submit_times) { @@ -109,18 +109,18 @@ static void _do_io_submit_block(const long long int n_iocbs, assert(submit_ret > 0); } -static int _do_io_complete(const long long 
int min_completes, - const long long int max_completes, +static int _do_io_complete(const int64_t min_completes, + const int64_t max_completes, std::unique_ptr& aio_ctxt, std::vector>& reap_times) { const auto start_time = std::chrono::high_resolution_clock::now(); - long long int n_completes = io_pgetevents(aio_ctxt->_io_ctxt, - min_completes, - max_completes, - aio_ctxt->_io_events.data(), - nullptr, - nullptr); + int64_t n_completes = io_pgetevents(aio_ctxt->_io_ctxt, + min_completes, + max_completes, + aio_ctxt->_io_events.data(), + nullptr, + nullptr); reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); assert(n_completes >= min_completes); return n_completes; @@ -134,7 +134,7 @@ void do_aio_operation_sequential(const bool read_op, { struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); - const auto num_io_blocks = static_cast( + const auto num_io_blocks = static_cast( ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); #if DEBUG_DS_AIO_PERF const auto io_op_name = std::string(read_op ? "read" : "write"); @@ -145,15 +145,14 @@ void do_aio_operation_sequential(const bool read_op, std::vector> submit_times; std::vector> reap_times; const auto max_queue_bytes = - static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); + static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); auto start = std::chrono::high_resolution_clock::now(); - for (long long iocb_index = 0; iocb_index < num_io_blocks; - iocb_index += aio_ctxt->_queue_depth) { + for (int64_t iocb_index = 0; iocb_index < num_io_blocks; iocb_index += aio_ctxt->_queue_depth) { const auto start_offset = iocb_index * aio_ctxt->_block_size; const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; const auto n_iocbs = - min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); + min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); @@ -285,13 +284,13 @@ int open_file(const char* filename, const bool read_op) int regular_read(const char* filename, std::vector& buffer) { - long long int num_bytes; + int64_t num_bytes; const auto f_size = get_file_size(filename, num_bytes); assert(f_size != -1); buffer.resize(num_bytes); const auto fd = open(filename, O_RDONLY, 0600); assert(fd != -1); - long long int read_bytes = 0; + int64_t read_bytes = 0; auto r = 0; do { const auto buffer_ptr = buffer.data() + read_bytes; @@ -309,7 +308,7 @@ int regular_read(const char* filename, std::vector& buffer) return 0; } -static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) +static bool _validate_buffer(const char* filename, void* aio_buffer, const int64_t num_bytes) { std::vector regular_buffer; const auto reg_ret = regular_read(filename, regular_buffer); @@ -317,7 +316,7 @@ static bool _validate_buffer(const char* filename, void* aio_buffer, const long std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes" << std::endl; - if (static_cast(regular_buffer.size()) != num_bytes) { return false; } + if (static_cast(regular_buffer.size()) != num_bytes) { return false; } return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); } @@ -325,7 +324,7 @@ static bool _validate_buffer(const char* filename, void* aio_buffer, const long bool validate_aio_operation(const bool 
read_op, const char* filename, void* aio_buffer, - const long long int num_bytes) + const int64_t num_bytes) { const auto msg_suffix = std::string("deepspeed_aio_") + std::string(read_op ? "read()" : "write()") + diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h index 2940de945ee8f..aa4e49f4f4ede 100644 --- a/csrc/aio/common/deepspeed_aio_common.h +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -35,4 +35,4 @@ int regular_read(const char* filename, std::vector& buffer); bool validate_aio_operation(const bool read_op, const char* filename, void* aio_buffer, - const long long int num_bytes); + const int64_t num_bytes); diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index 763b2c253a34c..0536ff6a362e7 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -18,8 +18,8 @@ const int c_block_size = 128 * 1024; const int c_io_queue_depth = 8; io_xfer_ctxt::io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, + const int64_t file_offset, + const int64_t num_bytes, const void* buffer) : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) { @@ -36,7 +36,7 @@ io_prep_context::io_prep_context(const bool read_op, void io_prep_context::prep_iocbs(const int n_iocbs, const size_t num_bytes, const void* start_buffer, - const long long int start_offset) + const int64_t start_offset) { assert(static_cast(n_iocbs) <= _iocbs->size()); for (auto i = 0; i < n_iocbs; ++i) { @@ -64,24 +64,24 @@ io_prep_generator::io_prep_generator(const bool read_op, _next_iocb_index(0) { _num_io_blocks = - static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); + static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); _remaining_io_blocks = _num_io_blocks; } int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) { if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { - assert(static_cast(_remaining_bytes) == _remaining_io_blocks); + assert(static_cast(_remaining_bytes) == _remaining_io_blocks); return 0; } assert(static_cast(n_iocbs) <= iocbs->size()); - auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); + auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; - const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); + const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); if (_read_op) { io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); @@ -95,7 +95,7 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* return actual_n_iocbs; } -int get_file_size(const char* filename, long long int& size) +int get_file_size(const char* filename, int64_t& size) { struct stat st; if (stat(filename, &st) == -1) { return -1; } @@ -103,7 +103,7 @@ int get_file_size(const char* filename, long long int& size) return 0; } -void* ds_page_aligned_alloc(const size_t size, const bool lock) +void* ds_page_aligned_alloc(const int64_t size, const bool lock) { void* ptr; int retval; diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h index 9c58c2286610b..20e81fe8eebd7 100644 --- a/csrc/aio/common/deepspeed_aio_utils.h +++ 
b/csrc/aio/common/deepspeed_aio_utils.h @@ -30,13 +30,13 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. struct io_xfer_ctxt { const int _fd; - const long long int _base_offset; + const int64_t _base_offset; const void* _mem_buffer; - const long long int _num_bytes; + const int64_t _num_bytes; io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, + const int64_t file_offset, + const int64_t num_bytes, const void* buffer); }; @@ -54,7 +54,7 @@ struct io_prep_context { void prep_iocbs(const int n_iocbs, const size_t num_bytes, const void* start_buffer, - const long long int start_offset); + const int64_t start_offset); }; struct io_prep_generator { @@ -62,10 +62,10 @@ struct io_prep_generator { const std::unique_ptr& _xfer_ctxt; const size_t _block_size; - long long int _remaining_bytes; - long long int _num_io_blocks; - long long int _remaining_io_blocks; - long long int _next_iocb_index; + int64_t _remaining_bytes; + int64_t _num_io_blocks; + int64_t _remaining_io_blocks; + int64_t _next_iocb_index; io_prep_generator(const bool read_op, const std::unique_ptr& xfer_ctxt, @@ -74,6 +74,6 @@ struct io_prep_generator { int prep_iocbs(const int n_iocbs, std::vector* iocbs); }; -void* ds_page_aligned_alloc(const size_t size, const bool lock = false); +void* ds_page_aligned_alloc(const int64_t size, const bool lock = false); -int get_file_size(const char* filename, long long int& size); +int get_file_size(const char* filename, int64_t& size); diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp index dc820be528d05..6f311c5400c72 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -11,16 +11,16 @@ io_op_desc_t::io_op_desc_t(const bool read_op, const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate) : _read_op(read_op), _buffer(buffer), _fd(fd), _filename(filename), _file_num_bytes(file_num_bytes), - _num_threads(num_threads), - _num_bytes_per_thread(file_num_bytes / num_threads), + _intra_op_parallelism(intra_op_parallelism), + _num_bytes_per_thread(file_num_bytes / intra_op_parallelism), _validate(validate) { } diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h index 350d28d29d58e..f841b8ce520a5 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -14,9 +14,9 @@ struct io_op_desc_t { torch::Tensor _buffer; int _fd; const std::string _filename; - const long long int _file_num_bytes; - const int _num_threads; - const long long int _num_bytes_per_thread; + const int64_t _file_num_bytes; + const int _intra_op_parallelism; + const int64_t _num_bytes_per_thread; torch::Tensor _contiguous_buffer; const bool _validate; @@ -24,8 +24,8 @@ struct io_op_desc_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate); virtual void run(const int tid, diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index 41790b99bb889..da2ff568d74bf 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -4,31 +4,30 @@ // DeepSpeed Team #include "deepspeed_cpu_op.h" +#include 
"deepspeed_pin_tensor.h" using namespace std; -cpu_op_desc_t::cpu_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const int num_threads, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), - _cpu_buffer(buffer) +cpu_op_desc_t::cpu_op_desc_t( + const bool read_op, + const torch::Tensor& buffer, + const std::unique_ptr& pinned_tensor_mgr, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate), + _cpu_buffer(buffer), + _pinned_tensor_mgr(pinned_tensor_mgr), + _is_managed_bounce_buffer(false) { // Need to use CPU bounce buffer if buffer is not a page-locked DRAM memory. - _use_bounce_buffer = !(_buffer.is_cpu() && _buffer.is_pinned()); + _use_bounce_buffer = + !(_buffer.is_cpu() && (_buffer.is_pinned() || _pinned_tensor_mgr->is_managed(_buffer))); if (_use_bounce_buffer) { - if (_read_op) { - auto options = torch::TensorOptions() - .dtype(_buffer.dtype()) - .layout(_buffer.layout()) - .device(torch::kCPU); - _cpu_buffer = torch::empty(_buffer.nbytes(), options).pin_memory(); - } else { - _cpu_buffer = _buffer.to(torch::kCPU).pin_memory(); - } + _alloc_bounce_buffer(); + if (!_read_op) { _cpu_buffer.copy_(_buffer); } } _contiguous_buffer = _cpu_buffer.contiguous(); } @@ -37,15 +36,23 @@ char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_pt void cpu_op_desc_t::finish() { - if (_read_op) { - if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } - if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } + if (_use_bounce_buffer) { + if (_read_op) { + if (_buffer.is_cuda()) { + _buffer.copy_(_cpu_buffer.to(torch::Device(torch::kCUDA, _buffer.get_device()), + /*non_blocking=*/true)); + } + if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } + if (_buffer.is_cpu()) { _buffer.copy_(_cpu_buffer); } #if defined(__ENABLE_CANN__) - if (torch_npu::utils::is_npu(_buffer)) { - auto device = at::Device("npu:0"); - _buffer.copy_(_cpu_buffer.to(device)); - } + if (torch_npu::utils::is_npu(_buffer)) { + auto device = at::Device("npu:0"); + _buffer.copy_(_cpu_buffer.to(device)); + } #endif + } + + _free_bounce_buffer(); } } @@ -58,7 +65,7 @@ void cpu_op_desc_t::run(const int tid, std::unique_ptr& aio_ctxt, deepspeed_aio_config_t* aio_config) { - assert(tid < _num_threads); + assert(tid < _intra_op_parallelism); const auto base_offset = _num_bytes_per_thread * tid; std::unique_ptr xfer_ctxt( @@ -70,3 +77,24 @@ void cpu_op_desc_t::run(const int tid, do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); } } + +void cpu_op_desc_t::_alloc_bounce_buffer() +{ + auto options = torch::TensorOptions() + .dtype(_buffer.dtype()) + .layout(_buffer.layout()) + .device(torch::kCPU) + .requires_grad(false); + +#if defined(__CUDA_ARCH__) + _cpu_buffer = torch::empty(_buffer.numel(), options).pin_memory(); +#else + _is_managed_bounce_buffer = true; + _cpu_buffer = _pinned_tensor_mgr->alloc(_buffer.numel(), options); +#endif +} + +void cpu_op_desc_t::_free_bounce_buffer() +{ + if (_is_managed_bounce_buffer) { _pinned_tensor_mgr->free(_cpu_buffer); } +} diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h index da96dd2b1d50b..9de2fa2540486 100644 --- 
a/csrc/aio/py_lib/deepspeed_cpu_op.h +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -10,13 +10,16 @@ struct cpu_op_desc_t : io_op_desc_t { torch::Tensor _cpu_buffer; bool _use_bounce_buffer; + bool _is_managed_bounce_buffer; + const std::unique_ptr& _pinned_tensor_mgr; cpu_op_desc_t(const bool read_op, const torch::Tensor& buffer, + const std::unique_ptr& pinned_tensor_mgr, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate); void run(const int tid, @@ -28,4 +31,7 @@ struct cpu_op_desc_t : io_op_desc_t { void validate(); void finish(); + + void _alloc_bounce_buffer(); + void _free_bounce_buffer(); }; diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp index 752823dc7dd2c..a97a4ac18ba81 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -15,21 +15,28 @@ deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t() { for (auto iter = _locked_tensors.begin(); iter != _locked_tensors.end(); ++iter) { munlock(iter->first, iter->second); + std::free((void*)iter->first); } _locked_tensors.clear(); } -torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type) +torch::Tensor deepspeed_pin_tensor_t::alloc(const int64_t num_elem, + const torch::TensorOptions& options) { - const auto num_bytes = num_elem * elementSize(elem_type); + const auto scalar_dtype = torch::typeMetaToScalarType(options.dtype()); + const auto num_bytes = num_elem * torch::elementSize(scalar_dtype); auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true); assert(nullptr != pinned_buffer); _locked_tensors[pinned_buffer] = num_bytes; - auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU); + return at::from_blob(pinned_buffer, static_cast(num_elem), options); +} - return at::from_blob(pinned_buffer, static_cast(num_bytes), options); +torch::Tensor deepspeed_pin_tensor_t::alloc(const int64_t num_elem, const at::ScalarType& elem_type) +{ + auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU).requires_grad(false); + return alloc(num_elem, options); } bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) @@ -37,9 +44,18 @@ bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) auto addr = locked_tensor.data_ptr(); if (_locked_tensors.find(addr) != _locked_tensors.end()) { munlock(addr, _locked_tensors[addr]); + std::free(addr); _locked_tensors.erase(addr); return true; } return false; } + +bool deepspeed_pin_tensor_t::is_managed(const torch::Tensor& buffer) +{ + if (!buffer.is_cpu()) { return false; } + auto addr = buffer.data_ptr(); + if (_locked_tensors.find(addr) != _locked_tensors.end()) { return true; } + return false; +}; diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h index 4350a4ac7df67..4b8ad7e760858 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.h +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h @@ -15,13 +15,16 @@ Functionality for managing CPU tensors occupying page-locked memory. 
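The bounce-buffer rework above stages unpinned CPU tensors through the handle's pinned-tensor manager, and is_managed() lets cpu_op_desc_t recognize buffers that came from that manager and use them directly. Below is a minimal Python sketch of that flow, assuming the async_io op builder is available (libaio installed); the buffer sizes and file path are illustrative, not part of this patch.

import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

# Sketch: allocate a handle-managed, page-locked buffer so no bounce buffer
# is needed, then write it to storage and read it back through the handle.
aio = AsyncIOBuilder().load()
handle = aio.aio_handle(block_size=1024 * 1024,
                        queue_depth=128,
                        single_submit=False,
                        overlap_events=False,
                        intra_op_parallelism=1)  # renamed from num_threads

src = torch.randint(0, 256, (8 * 1024 * 1024,), dtype=torch.uint8)
# new_cpu_locked_tensor() pins memory via deepspeed_pin_tensor_t, so the
# cpu op descriptor treats it as managed and skips the bounce-buffer copy.
pinned = handle.new_cpu_locked_tensor(src.numel(), src)
pinned.copy_(src)

handle.async_pwrite(pinned, '/tmp/ds_aio_example.bin')  # illustrative path
handle.wait()  # returns the number of completed ops (1 here)

readback = handle.new_cpu_locked_tensor(src.numel(), src)
handle.read(readback, '/tmp/ds_aio_example.bin', False)  # validate=False
assert torch.equal(readback, src)

handle.free_cpu_locked_tensor(pinned)
handle.free_cpu_locked_tensor(readback)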
#include "deepspeed_py_aio.h" struct deepspeed_pin_tensor_t { - std::map _locked_tensors; + std::map _locked_tensors; deepspeed_pin_tensor_t() = default; ~deepspeed_pin_tensor_t(); - torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type); + torch::Tensor alloc(const int64_t num_elem, const at::ScalarType& elem_type); + torch::Tensor alloc(const int64_t num_elem, const torch::TensorOptions& options); bool free(torch::Tensor& locked_tensor); + + bool is_managed(const torch::Tensor& buffer); }; diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index eac268d334338..02b04057d1ac3 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -51,7 +51,7 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, if (fd == -1) { return -1; } auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); + const auto num_write_bytes = static_cast(buffer.nbytes()); std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); @@ -83,7 +83,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const bool validate) { const auto start_time = std::chrono::high_resolution_clock::now(); - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); @@ -95,7 +95,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, if (fd == -1) { return -1; } auto read_buffer = (char*)buffer.data_ptr(); - assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert(static_cast(buffer.nbytes()) == num_file_bytes); std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index c7ca5e82afdef..2b1093e992865 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -16,8 +16,12 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) - : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads) + const int intra_op_parallelism) + : deepspeed_io_handle_t(block_size, + queue_depth, + single_submit, + overlap_events, + intra_op_parallelism) { } diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index eb6b90ea22f03..1398df9a56c96 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -16,7 +16,7 @@ struct deepspeed_aio_handle_t : deepspeed_io_handle_t { const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); ~deepspeed_aio_handle_t(); }; diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp index bdf2a858d797a..48ea8a1339d45 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -18,16 +18,16 @@ deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) + const int 
intra_op_parallelism) : _aio_ctxt(new aio_context(block_size, queue_depth)), _single_submit(single_submit), _overlap_events(overlap_events), - _num_threads(num_threads), + _intra_op_parallelism(intra_op_parallelism), _aio_config(block_size, queue_depth, single_submit, overlap_events, false), _num_pending_ops(0), _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) { - for (auto i = 0; i < num_threads; ++i) { + for (auto i = 0; i < intra_op_parallelism; ++i) { _thread_contexts.push_back(std::make_shared(i, _aio_config)); } @@ -56,7 +56,7 @@ const bool deepspeed_io_handle_t::get_single_submit() const { return _single_sub const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_events; } -const int deepspeed_io_handle_t::get_thread_count() const { return _num_threads; } +const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; } int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) { @@ -64,13 +64,13 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con assert(_aio_ctxt); - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); return -1; } - assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert(static_cast(buffer.nbytes()) == num_file_bytes); const auto fd = open_file(filename, true); if (fd == -1) { return -1; } @@ -108,7 +108,7 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer, if (fd == -1) { return -1; } auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); + const auto num_write_bytes = static_cast(buffer.nbytes()); std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); if (_aio_config._overlap_events) { @@ -188,13 +188,12 @@ int deepspeed_io_handle_t::wait() return num_completed_ops; } -bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, - const long long int num_bytes) +bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes) { const auto op_string = read_op ? 
"Read" : "Write"; - if (num_bytes % get_thread_count()) { + if (num_bytes % get_intra_op_parallelism()) { std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes - << " not divisible by thread count = " << get_thread_count() << std::endl; + << " not divisible by thread count = " << get_intra_op_parallelism() << std::endl; return false; } @@ -206,11 +205,17 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate) { - return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); + return std::make_shared(read_op, + buffer, + _pinned_tensor_mgr, + fd, + filename, + file_num_bytes, + _intra_op_parallelism, + validate); } int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, @@ -218,19 +223,19 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, const bool validate, const bool async) { - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); return -1; } - const auto buffer_bytes = static_cast(buffer.nbytes()); + const auto buffer_bytes = static_cast(buffer.nbytes()); if (buffer_bytes != num_file_bytes) { std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes << " != " << num_file_bytes << std::endl; } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - assert((num_file_bytes % _num_threads) == 0); + assert(buffer_bytes == num_file_bytes); + assert((num_file_bytes % _intra_op_parallelism) == 0); if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } @@ -251,8 +256,8 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, const bool validate, const bool async) { - const auto num_write_bytes = static_cast(buffer.nbytes()); - assert((num_write_bytes % _num_threads) == 0); + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _intra_op_parallelism) == 0); if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } @@ -288,7 +293,7 @@ int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* return pwrite(buffer, filename, false, true); } -at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem, +at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem, const torch::Tensor& example_tensor) { return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h index 2974ebe87bfc1..4fedf80808189 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -16,7 +16,7 @@ struct deepspeed_io_handle_t { std::unique_ptr _aio_ctxt; const bool _single_submit; const bool _overlap_events; - const int _num_threads; + const int _intra_op_parallelism; deepspeed_aio_config_t _aio_config; std::vector> _thread_contexts; @@ -28,7 +28,7 @@ struct deepspeed_io_handle_t { const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); virtual ~deepspeed_io_handle_t() = 0; @@ -36,7 +36,7 @@ struct deepspeed_io_handle_t { const int get_queue_depth() const; const bool get_single_submit() const; const bool get_overlap_events() const; - const int 
get_thread_count() const; + const int get_intra_op_parallelism() const; int read(torch::Tensor& buffer, const char* filename, const bool validate); @@ -61,7 +61,8 @@ struct deepspeed_io_handle_t { int async_pwrite(const torch::Tensor& buffer, const char* filename); // TODO: Make API's args to be shape and dtype. - torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor); + torch::Tensor new_cpu_locked_tensor(const int64_t num_elem, + const torch::Tensor& example_tensor); bool free_cpu_locked_tensor(torch::Tensor&); @@ -73,13 +74,12 @@ struct deepspeed_io_handle_t { std::shared_ptr _wait_for_aio_work(); - bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); + bool _is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes); - virtual std::shared_ptr _create_io_op_desc( - const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const bool validate); + virtual std::shared_ptr _create_io_op_desc(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const bool validate); }; diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index 3171d0c6bf3c2..b80fa2d6c8e6d 100644 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -27,13 +27,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "queue_depth"_a = 128, "single_submit"_a = false, "overlap_events"_a = false, - "num_threads"_a = 1) + "intra_op_parallelism"_a = 1) .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) - .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) + .def("get_intra_op_parallelism", &deepspeed_aio_handle_t::get_intra_op_parallelism) .def("read", &deepspeed_aio_handle_t::read, diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index c370a448e5a21..f49f74394374e 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -58,7 +58,6 @@ void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer) const int64_t device = buffer.get_device(); void* reg_ptr = buffer.data_ptr(); - // std::cout << "REG PTR " << reg_ptr << std::endl; // TODO: add checking to make sure pointer isn't already in set const auto it = base_ptr_registry.find(device); if (it == base_ptr_registry.end()) { @@ -94,10 +93,10 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate) { _contiguous_buffer = _buffer.contiguous(); const int64_t device = _buffer.get_device(); @@ -123,7 +122,7 @@ void gds_op_desc_t::run(const int tid, std::unique_ptr& aio_ctxt, deepspeed_aio_config_t* aio_config) { - assert(tid < _num_threads); + assert(tid < _intra_op_parallelism); check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; const 
auto file_offset = _num_bytes_per_thread * tid; diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index b7fab64d40549..380bb0b9b6aed 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -22,8 +22,8 @@ struct gds_op_desc_t : io_op_desc_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate); void run(const int tid, diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index 15fd516acaae5..c052144a0190b 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -19,21 +19,25 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) - : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1) + const int intra_op_parallelism) + : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1), + _intra_gds_op_parallelism(intra_op_parallelism) { - _init_cuFile(block_size, queue_depth, num_threads); + _init_cuFile(block_size, queue_depth); } deepspeed_gds_handle_t::~deepspeed_gds_handle_t() { _close_cuFile(); } -void deepspeed_gds_handle_t::_init_cuFile(const int block_size, - const int queue_depth, - const int num_threads) +const int deepspeed_gds_handle_t::get_intra_op_parallelism() const +{ + return _intra_gds_op_parallelism; +} + +void deepspeed_gds_handle_t::_init_cuFile(const int block_size, const int queue_depth) { if (deepspeed_gds_handle_t::s_cuFile_init == 0) { std::string depthStr = std::to_string(queue_depth); - std::string threadsStr = std::to_string(num_threads); + std::string threadsStr = std::to_string(_intra_gds_op_parallelism); std::string json1 = R"({"execution": {"max_io_queue_depth": )" + depthStr + ", "; std::string json2 = R"("max_request_parallelism": )" + threadsStr + ", "; std::string json3 = R"("max_io_threads": )" + threadsStr + ", "; @@ -102,12 +106,12 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate) { if (buffer.is_cuda()) { return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); + read_op, buffer, fd, filename, file_num_bytes, _intra_op_parallelism, validate); } return deepspeed_io_handle_t::_create_io_op_desc( read_op, buffer, fd, filename, file_num_bytes, validate); diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index f324e6b65e80b..131e83e7b838a 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -12,11 +12,13 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
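Editor's note on the num_threads -> intra_op_parallelism rename above: here is a minimal, hedged sketch of what the handle looks like from Python after this change. It only uses names that appear in this diff (the aio_handle kwargs, get_intra_op_parallelism, new_cpu_locked_tensor, pread, wait); the file path and sizes are illustrative, and the file is assumed to already exist with a size equal to the buffer size.

    import torch
    from deepspeed.ops.op_builder import AsyncIOBuilder

    # intra_op_parallelism replaces the old num_threads kwarg and get_thread_count() getter.
    h = AsyncIOBuilder().load().aio_handle(block_size=1024 * 1024,
                                           queue_depth=128,
                                           single_submit=False,
                                           overlap_events=False,
                                           intra_op_parallelism=4)
    assert h.get_intra_op_parallelism() == 4

    # Parallel reads/writes still require the transfer size to be divisible by
    # intra_op_parallelism (see the asserts in pread/pwrite above).
    buf = h.new_cpu_locked_tensor(4 * 1024 * 1024, torch.empty(0, dtype=torch.uint8))
    h.pread(buf, '/mnt/nvme0/example.bin', False, True)  # validate=False, async=True
    h.wait()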
#include "deepspeed_py_io_handle.h" struct deepspeed_gds_handle_t : deepspeed_io_handle_t { + const int _intra_gds_op_parallelism; + deepspeed_gds_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); ~deepspeed_gds_handle_t(); @@ -29,15 +31,17 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t { bool unpin_device_tensor(const torch::Tensor& buffer); - void _init_cuFile(const int block_size, const int queue_length, const int num_threads); + void _init_cuFile(const int block_size, const int queue_depth); void _close_cuFile(); + const int get_intra_op_parallelism() const; + std::shared_ptr _create_io_op_desc(const bool read_op, const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate); static int s_cuFile_init; diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp index 66eb34d4ea8cf..57bf8d2207c45 100644 --- a/csrc/gds/py_lib/py_ds_gds.cpp +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -20,13 +20,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "queue_depth"_a = 128, "single_submit"_a = false, "overlap_events"_a = false, - "num_threads"_a = 1) + "intra_op_parallelism"_a = 1) .def("get_block_size", &deepspeed_gds_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_gds_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events) - .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count) + .def("get_intra_op_parallelism", &deepspeed_gds_handle_t::get_intra_op_parallelism) .def("read", &deepspeed_gds_handle_t::read, diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 2d5332578edc5..19dbe73726f74 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -452,14 +452,17 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, unsigned layer_id, unsigned num_layers, at::Tensor& alibi, - float rope_theta) + float rope_theta, + bool is_prompt, + std::optional token_idx, + std::optional position_ids) { unsigned bsz = query_key_value.size(0); unsigned seq_len = query_key_value.size(1); int k = query_key_value.size(2) / (heads + 2 * (num_kv > 0 ? num_kv : heads)); unsigned hidden_dim = heads * k; - bool is_prompt = (seq_len > 1); + is_prompt = (seq_len > 1); if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len); unsigned soft_len = InferenceContext::Instance().current_tokens(); @@ -2031,7 +2034,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)"); \ m.def("dequantize_" #_name, \ &ds_dequantize<_dtype>, \ - "DeepSpeed dequantize with " #_name " (CUDA)") + "DeepSpeed dequantize with " #_name " (CUDA)"); DEF_OPS(fp32, float); DEF_OPS(fp16, __half); diff --git a/csrc/xpu/aio/deepspeed_cpu_op.cpp b/csrc/xpu/aio/deepspeed_cpu_op.cpp new file mode 100644 index 0000000000000..ee98c2d5cac2f --- /dev/null +++ b/csrc/xpu/aio/deepspeed_cpu_op.cpp @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "deepspeed_cpu_op.h" + +using namespace std; + +cpu_op_desc_t::cpu_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), + _cpu_buffer(buffer) +{ + // XPU don't handle buffer here. See XPU Accelerator pin_memory. + _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void cpu_op_desc_t::finish() +{ + if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } +} + +void cpu_op_desc_t::validate() +{ + validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes); +} + +void cpu_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ + assert(tid < _num_threads); + const auto base_offset = _num_bytes_per_thread * tid; + + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr())); + + if (aio_config->_overlap_events) { + do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } else { + do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } +} diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 74fc7f74f6bdd..6574d49fb1323 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -53,12 +53,7 @@ def __init__(self, model, config): DS_INFERENCE_ENABLED = True super().__init__() - - # Have to import here because inference_module is a global, but python - # globals only work at the module level and will not be updated unless - # we import it each time we init a new inference engine. - from ..model_implementations.transformers.ds_transformer import inference_module - if inference_module is not None: + if DeepSpeedTransformerInference.workspace is not None: self.destroy() self.module = model @@ -73,8 +68,9 @@ def __init__(self, model, config): if hasattr(self.module, "config"): TransformerPolicy.hf_model_config = self.module.config - if config.dtype == torch.half and not get_accelerator().is_fp16_supported(): - raise ValueError("Type fp16 is not supported.") + if config.dtype not in get_accelerator().supported_dtypes(): + raise ValueError( + f"Data type {config.dtype} is not supported by {get_accelerator().device_name()} accelerator") # todo: keep this self.injection_dict because we don't use to change config.injection_policy API # todo: this will get changed when Molly's PR on auto injection dict is merged @@ -190,15 +186,11 @@ def __init__(self, model, config): self._is_compiled = False def destroy(self): - # Have to import here because inference_module is a global, but python - # globals only work at the module level and will not be updated unless - # we import it each time we init a new inference engine. 
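Editor's note on the dtype gate added in InferenceEngine.__init__ above: the engine now validates the requested dtype against the accelerator instead of hard-coding an fp16 check. A hedged illustration of the same gate from user code, assuming the standard deepspeed.init_inference entry point and an already-constructed model (both outside this diff):

    import torch
    import deepspeed
    from deepspeed.accelerator import get_accelerator

    dtype = torch.bfloat16
    # Mirrors the check added in InferenceEngine.__init__ above.
    if dtype not in get_accelerator().supported_dtypes():
        raise ValueError(f"{dtype} is not supported by {get_accelerator().device_name()}")

    engine = deepspeed.init_inference(model, dtype=dtype)  # 'model' assumed defined elsewhere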
- from ..model_implementations.transformers.ds_transformer import inference_module DeepSpeedTransformerInference.layer_id = 0 DeepSpeedSelfAttention.num_layers = 0 - if inference_module is not None: - inference_module.release_workspace() - inference_module = None + if DeepSpeedTransformerInference.workspace.is_allocated(): + DeepSpeedTransformerInference.workspace.release_workspace() + DeepSpeedTransformerInference.workspace = None def profile_model_time(self, use_cuda_events=True): if not self.model_profile_enabled and not self._config.enable_cuda_graph: @@ -324,7 +316,7 @@ def _validate_args(self, mpu, replace_with_kernel_inject): if self._config.checkpoint is not None and not isinstance(self._config.checkpoint, (str, dict)): raise ValueError(f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}") - supported_dtypes = [None, torch.half, torch.int8, torch.float] + supported_dtypes = [None, torch.half, torch.int8, torch.float, torch.bfloat16] if self._config.dtype not in supported_dtypes: raise ValueError(f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index a816b56857603..74d20a6d53e52 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -141,6 +141,17 @@ def validate_args(self): def get_cmd(self, environment, active_resources): total_process_count = sum(self.resource_pool.values()) + launcher_args = split(self.args.launcher_args) + + # If btl_tcp_if_include option is provided through launcher_args, we use it. Otherwise, we add + # `--mca btl_tcp_if_include eth0` option as a default value for compatibility. + btl_tcp_opt = ['--mca', 'btl_tcp_if_include', 'eth0'] + if len(launcher_args) >= 2: + for i in range(len(launcher_args) - 1): + if launcher_args[i] in ['-mca', '--mca'] and launcher_args[i + 1] == 'btl_tcp_if_include': + btl_tcp_opt = [] + break + mpirun_cmd = [ 'mpirun', '-n', @@ -150,10 +161,7 @@ def get_cmd(self, environment, active_resources): '--mca', 'btl', '^openib', - '--mca', - 'btl_tcp_if_include', - 'eth0', - ] + split(self.args.launcher_args) + ] + btl_tcp_opt + launcher_args export_cmd = [] for k, v in self.exports.items(): diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 076f65b201b34..5f926834dda22 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -20,6 +20,8 @@ from copy import deepcopy import signal import time +from typing import Tuple, List, Dict +from collections import defaultdict import shlex from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, IMPIRunner @@ -263,6 +265,31 @@ def _stable_remove_duplicates(data): return new_list +def parse_node_config(node_config: str) -> Tuple[str, List[int]]: + SLOT_LIST_START = ':' + SLOT_SEP = ',' + + if SLOT_LIST_START not in node_config: + return node_config, [] + + hostname, slots = node_config.split(SLOT_LIST_START) + slots = [int(x) for x in slots.split(SLOT_SEP)] + + return hostname, slots + + +def parse_node_config_list(node_config_list: List[str]) -> Dict[str, List[int]]: + NODE_SEP = '@' + + node_configs = defaultdict(list) + + for node_config in node_config_list.split(NODE_SEP): + hostname, slots = parse_node_config(node_config) + node_configs[hostname] += slots + + return {k: sorted(list(set(v))) for k, v in node_configs.items()} + + def parse_resource_filter(host_info, include_str="", exclude_str=""): '''Parse an inclusion or 
exclusion string and filter a hostfile dictionary. @@ -277,11 +304,6 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): slot 0 on worker-1. ''' - # Constants that define our syntax - NODE_SEP = '@' - SLOT_LIST_START = ':' - SLOT_SEP = ',' - # Ensure include/exclude are mutually exclusive if (include_str != "") and (exclude_str != ""): raise ValueError('include_str and exclude_str are mutually exclusive.') @@ -299,12 +321,9 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): parse_str = exclude_str # foreach node in the list - for node_config in parse_str.split(NODE_SEP): + for hostname, slots in parse_node_config_list(parse_str).items(): # Node can either be alone or node:slot,slot,slot - if SLOT_LIST_START in node_config: - hostname, slots = node_config.split(SLOT_LIST_START) - slots = [int(x) for x in slots.split(SLOT_SEP)] - + if len(slots) > 0: # sanity checks if hostname not in host_info: raise ValueError(f"Hostname '{hostname}' not found in hostfile") @@ -322,7 +341,6 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): # User just specified the whole node else: - hostname = node_config # sanity check hostname if hostname not in host_info: raise ValueError(f"Hostname '{hostname}' not found in hostfile") @@ -355,8 +373,10 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): def parse_inclusion_exclusion(resource_pool, inclusion, exclusion): active_resources = collections.OrderedDict() + node_configs = parse_node_config_list(inclusion) + for hostname, slots in resource_pool.items(): - active_resources[hostname] = list(range(slots)) + active_resources[hostname] = node_configs[hostname] if hostname in node_configs else list(range(slots)) return parse_resource_filter(active_resources, include_str=inclusion, exclude_str=exclusion) diff --git a/deepspeed/model_implementations/transformers/ds_llama2.py b/deepspeed/model_implementations/transformers/ds_llama2.py index 7d9eb4113a8a5..325bfb4f7e181 100644 --- a/deepspeed/model_implementations/transformers/ds_llama2.py +++ b/deepspeed/model_implementations/transformers/ds_llama2.py @@ -4,11 +4,8 @@ # DeepSpeed Team import torch -from deepspeed import comm as dist from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference -inference_module = None - class DeepSpeedLlama2Inference(DeepSpeedTransformerInference): """Initialize the DeepSpeed OPT Transformer Layer. 
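Editor's note on the launcher changes in deepspeed/launcher/runner.py above: the new parse_node_config and parse_node_config_list helpers centralize the hostname:slot@hostname:slot syntax used by --include/--exclude, and parse_inclusion_exclusion now honors explicit slot lists on inclusion. A small worked example with made-up hostnames, following the code as written:

    parse_node_config("worker-1:0,1,2")   # -> ("worker-1", [0, 1, 2])
    parse_node_config("worker-1")         # -> ("worker-1", [])

    parse_node_config_list("worker-0:0,1@worker-1:1,0,3")
    # -> {"worker-0": [0, 1], "worker-1": [0, 1, 3]}   (slots de-duplicated and sorted)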
@@ -27,18 +24,10 @@ def forward(self, *args, **kwargs): input = args[0] input_mask = None - # Allocate memory only on first layer forward - if self.config.layer_id == 0 and self._alloc_workspace: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, - self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, - self.config.min_out_tokens) - self._alloc_workspace = False - get_present = True + self.allocate_workspace(input.size()) + # We set the prev key/value to None when there is a prompt if input.shape[1] > 1: self.layer_past = None diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index d87d0de997b51..360113b78a3db 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -6,19 +6,18 @@ import torch import torch.nn as nn from deepspeed import comm as dist +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from deepspeed.utils.logging import log_dist from deepspeed.ops.transformer.inference.ds_mlp import DeepSpeedMLP from deepspeed.ops.transformer.inference.ds_attention import DeepSpeedSelfAttention, BloomSelfAttention +from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder import deepspeed if deepspeed.HAS_TRITON: from deepspeed.ops.transformer.inference.triton.mlp import TritonMLP from deepspeed.ops.transformer.inference.triton.attention import TritonSelfAttention -inference_module = None - class DeepSpeedTransformerInference(nn.Module): """Initialize the DeepSpeed Transformer Layer. @@ -37,6 +36,7 @@ class DeepSpeedTransformerInference(nn.Module): for specific downstream tasks. 
""" layer_id = 0 + workspace = None def __init__(self, config, @@ -52,10 +52,6 @@ def __init__(self, DeepSpeedTransformerInference.layer_id += 1 data_type = torch.half if self.config.dtype == torch.int8 else self.config.dtype - global inference_module - if inference_module is None: - builder = InferenceBuilder() - inference_module = builder.load() if DeepSpeedTransformerInference.layer_id == 1: log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) @@ -88,22 +84,25 @@ def __init__(self, self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) self.layer_past = None - try: - if config.dtype == torch.float32: - self.allocate_workspace = inference_module.allocate_workspace_fp32 - elif config.dtype == torch.bfloat16: - self.allocate_workspace = inference_module.allocate_workspace_bf16 - else: - self.allocate_workspace = inference_module.allocate_workspace_fp32 - self._alloc_workspace = True - except AttributeError: - self.allocate_workspace = None - self._alloc_workspace = False + self.layer_norm = LayerNormOp() + if DeepSpeedTransformerInference.workspace is None: + DeepSpeedTransformerInference.workspace = WorkspaceOp(self.config) + self._should_allocate_workspace = True + + def allocate_workspace(self, size): + # Allocate memory only on first layer forward + if self.config.layer_id == 0 and self._should_allocate_workspace: + DeepSpeedTransformerInference.workspace.allocate_workspace( + self.config.hidden_size, self.config.heads, size[1], size[0], DeepSpeedTransformerInference.layer_id, + self.config.mp_size, self.config.bigscience_bloom, + dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, + self.config.min_out_tokens) + self._should_allocate_workspace = False @classmethod def reset_cache(cls): - if inference_module is not None: - inference_module.reset_cache() + if cls.workspace is not None: + cls.workspace.reset_cache() def forward( self, @@ -136,15 +135,7 @@ def forward( input_mask = (input_mask if attn_mask is None else attn_mask) if attention_mask is None else attention_mask - # Allocate memory only on first layer forward - if self.config.layer_id == 0 and self._alloc_workspace: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, - self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, - self.config.min_out_tokens) - self._alloc_workspace = False + self.allocate_workspace(input.size()) get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask @@ -178,14 +169,15 @@ def forward( output_attentions, self.norm_w, self.norm_b, - alibi) + alibi, + **kwargs) presents = (key, value) self.layer_past = presents if layer_past is None else None output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: - output = inference_module.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) + output = self.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) output = output.to(input_type) if get_present: diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py old mode 100644 new mode 100755 index e6eea2183de5a..221d490a37d2a --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -333,7 +333,8 @@ def _replace(self, child, name, 
conv_linear_layer): weight_shape = child.weight.shape mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) # For mixtral-7x8b, need to skip MoE gate linear replace. - if name == "block_sparse_moe.gate": + if name == "block_sparse_moe.gate" or (('mlp.shared_expert_gate' == name or 'mlp.gate' == name) + and 'qwen2_moe' in str(type(self.module))): return child # For Yuan model if 'Yuan' in str(self.module): @@ -495,7 +496,8 @@ def get_model_num_kv_heads(self, config): num_kv_heads = None # multi_query_group_num is for chatglm2 & chatglm3 kv_head_names = [ - 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads' + 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads', + 'attention_heads' ] for name in kv_head_names: if hasattr(config, name): diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 64dc5479940c3..cf70c4530c822 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -274,7 +274,13 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): _autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) # 3. Try to get num_key_heads from model_config.num_key_value_heads - num_kv_heads = _autotp.get_model_num_kv_heads(model_config) + if hasattr(model_config, "vision_config"): + if "MllamaVisionEncoderLayer" in str(module): + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config) # 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division set_num_kv_heads(num_kv_heads) @@ -339,6 +345,8 @@ def set_lm_head(module): "weight") and not module.embed_out.weight.is_meta and isinstance( module.embed_out, torch.nn.Linear): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") + elif hasattr(module.language_model, "lm_head"): + module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head") return module def conv2d_parallel_shard_weights(model, rank, world_size): diff --git a/deepspeed/nvme/__init__.py b/deepspeed/nvme/__init__.py new file mode 100644 index 0000000000000..6d0de857cbd34 --- /dev/null +++ b/deepspeed/nvme/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .perf_run_sweep import sweep_main, parse_sweep_arguments +from .perf_generate_param import generate_main +from .test_ds_aio import ds_io_main diff --git a/deepspeed/nvme/ds_aio_args.py b/deepspeed/nvme/ds_aio_args.py new file mode 100644 index 0000000000000..9ed71c34a74d5 --- /dev/null +++ b/deepspeed/nvme/ds_aio_args.py @@ -0,0 +1,175 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
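Editor's note on the new deepspeed/nvme package whose files follow: it collects the DeepNVMe benchmarking utilities behind importable entry points. A hedged usage sketch based only on the exports in __init__.py above (parse_sweep_arguments reads command-line flags such as --nvme_dir, so the directory below is illustrative):

    from deepspeed.nvme import parse_sweep_arguments, sweep_main, generate_main

    args = parse_sweep_arguments()   # requires --nvme_dir; optional --io_size, --gpu, --gds, ...
    sweep_main(args)                 # runs the read/write sweeps, writing logs under args.log_dir
    generate_main(args.log_dir)      # prints the best {"aio": {...}} configuration found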
+""" + +import argparse +import os +from .test_ds_aio_utils import refine_integer_value +from deepspeed.accelerator import get_accelerator + +MAPPING_DELIMITER = ':' + + +def refine_args(args): + if args.io_size and type(args.io_size) == str: + args.io_size = refine_integer_value(args.io_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + return args + + +def _get_mapping_dict(args): + if args.folder is not None: + d = {i: args.folder for i in range(args.multi_process)} + else: + d = {} + for m in args.folder_to_device_mapping: + fields = m.split(MAPPING_DELIMITER) + d[fields[1]] = fields[0] + + return d + + +def _validate_folder_mapping(args): + no_error = True + error_messages = [] + invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m] + if len(invalid_mappings) > 0: + error_messages.append( + f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}') + no_error = False + + folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping] + invalid_folders = [d for d in folder_list if not os.path.exists(d)] + if len(invalid_folders) > 0: + error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}') + no_error = False + + if args.gpu: + device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping] + invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()] + if len(invalid_device_list) > 0: + error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}') + no_error = False + + return no_error, error_messages + + +def validate_args(args): + no_error = True + error_messages = [] + + if args.folder is not None and len(args.folder_to_device_mapping) > 0: + error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + no_error = False + elif args.folder is None and len(args.folder_to_device_mapping) == 0: + error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + no_error = False + + # Validate --folder + if args.folder is not None and not os.path.exists(args.folder): + no_error = False + error_messages.append(f'Invalid folder in --folder: {args.folder} ') + + # Validate --folder_mapping_to_device + if len(args.folder_to_device_mapping) > 0: + no_mapping_error, mapping_error_messages = _validate_folder_mapping(args) + no_error = no_error and no_mapping_error + error_messages += mapping_error_messages + + # Validate --gpu, --use_gds + if args.use_gds and not args.gpu: + error_messages.append(f'--gpu must be set to transfer with --use_gds') + no_error = False + + if not no_error: + print(f'Found {len(error_messages)} validation errors') + for i, msg in enumerate(error_messages): + print(f'{i+1}: {msg}') + + return no_error + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.') + + parser.add_argument('--folder_to_device_mapping', + default=[], + nargs='+', + help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).' + 'Can be specified multiple times for multi-process runs,' + 'e.g. 
--folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu' + 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15') + + parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.') + + parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)') + + parser.add_argument('--multi_process', + type=int, + default=1, + help='Number of parallel processes doing I/O (default 1).') + + parser.add_argument('--block_size', + type=str, + default='1M', + help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabytes).') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).') + + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') + + parser.add_argument( + '--sequential_requests', + action='store_true', + help= + 'Delay I/O request submission until completion of prior requests (default is overlap I/O submission and completion requests.).' + ) + + parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions') + + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO') + + parser.add_argument('--slow_bounce_buffer', + action='store_true', + help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def get_validated_args(): + args = parse_arguments() + args = refine_args(args) + if not validate_args(args): + quit() + print(f'Successful validation of command line arguments') + + peer_tag = 'gpu' if args.gpu else 'process' + args.mapping_dict = _get_mapping_dict(args) + args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()] + assert len(args.mapping_dict) == len(args.mapping_list) + print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping') + for i, (device_id, folder) in enumerate(args.mapping_list): + print(f'[{i}]: {peer_tag} {device_id} <----> {folder}') + + return args diff --git a/deepspeed/nvme/ds_aio_basic.py b/deepspeed/nvme/ds_aio_basic.py new file mode 100755 index 0000000000000..b346fe9bbfeb8 --- /dev/null +++ b/deepspeed/nvme/ds_aio_basic.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
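Editor's note on refine_args in ds_aio_args.py above: --io_size and --block_size accept human-friendly size strings, normalized by refine_integer_value (defined in test_ds_aio_utils.py later in this diff). A quick worked example of that conversion:

    refine_integer_value('1M')    # -> 1048576      (1 * 1024**2)
    refine_integer_value('400M')  # -> 419430400    (400 * 1024**2)
    refine_integer_value('8K')    # -> 8192
    refine_integer_value('4096')  # -> 4096         (no suffix: plain integer)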
+""" + +import torch +import os +import time +from deepspeed.ops.aio import AsyncIOBuilder +from multiprocessing import Pool, Barrier +from .test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_basic(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_basic_read(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, True) + return ctxt + + +def pre_basic_write(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, False) + return ctxt + + +def post_basic(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_basic_read(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, not args.sequential_requests, args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_basic_write(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, not args.sequential_requests, args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_basic_read + schedule['post'] = post_basic + schedule['main'] = main_basic_read + else: + schedule['pre'] = pre_basic_write + schedule['post'] = post_basic + schedule['main'] = main_basic_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, num_processes) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, num_processes) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_tasklet(b): + global aio_barrier + aio_barrier = b + + +def aio_basic_multiprocessing(args, read_op): + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + 
report_results(args, read_op, pool_results) diff --git a/deepspeed/nvme/ds_aio_handle.py b/deepspeed/nvme/ds_aio_handle.py new file mode 100755 index 0000000000000..47c0cd709ec5a --- /dev/null +++ b/deepspeed/nvme/ds_aio_handle.py @@ -0,0 +1,222 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from multiprocessing import Pool, Barrier +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from deepspeed.accelerator import get_accelerator +from .test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file + +BUFFER = 'buffer' +BOUNCE_BUFFER = 'bounce_buffer' + + +def pre_handle(args, tid, read_op): + io_string = "Read" if read_op else "Write" + gds = True if args.use_gds else False + device_id, folder = args.mapping_list[tid] + filename = create_filename(folder, args.read, args.io_size, tid) + if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size): + create_file(filename, args.io_size) + + task_log(tid, f'Allocate tensor of size {args.io_size} bytes') + bounce_buffer = None + if args.gpu: + device_name = get_accelerator().device_name(device_id) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name) + if not (args.slow_bounce_buffer or gds): + bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, + device='cpu').pin_memory() + else: + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, + f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', + force=True) + + io_parallel = args.io_parallel if args.io_parallel else 1 + if gds: + handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + handle.pin_device_tensor(buffer) + else: + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + task_log(tid, f'created deepspeed aio handle') + + ctxt = {} + ctxt['file'] = filename + ctxt['num_bytes'] = args.io_size + ctxt['handle'] = handle + ctxt['gds'] = gds + ctxt[BUFFER] = buffer + ctxt[BOUNCE_BUFFER] = bounce_buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_handle_read(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, True) + return ctxt + + +def pre_handle_write(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, False) + return ctxt + + +def post_handle(pool_params): + _, _, ctxt = pool_params + for buf in [BUFFER, BOUNCE_BUFFER]: + if ctxt[buf] is not None: + if ctxt['gds']: + ctxt['handle'].unpin_device_tensor(ctxt[buf]) + ctxt[buf].detach() + ctxt[buf] = None + return ctxt + + +def main_parallel_read(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + + start_time = time.time() + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + return ctxt + + +def main_parallel_write(pool_params): + args, tid, 
ctxt = pool_params + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + + handle = ctxt['handle'] + start_time = time.time() + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_read(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + + start_time = time.time() + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate) + assert ret != -1 + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_write(pool_parms): + args, tid, ctxt = pool_parms + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + + handle = ctxt['handle'] + start_time = time.time() + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_handle_read + schedule['post'] = post_handle + schedule['main'] = main_parallel_read + else: + schedule['pre'] = pre_handle_write + schedule['post'] = post_handle + schedule['main'] = main_parallel_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, num_processes) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, num_processes) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_tasklet(b): + global aio_barrier + aio_barrier = b + + +def aio_handle_multiprocessing(args, read_op): + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/deepspeed/nvme/ds_aio_job.py b/deepspeed/nvme/ds_aio_job.py new file mode 100644 index 0000000000000..0f9c8b5f1bcc5 --- /dev/null +++ b/deepspeed/nvme/ds_aio_job.py @@ -0,0 +1,50 @@ +# Copyright (c) 
Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping tensors to/from (NVMe) storage devices. +""" +import subprocess +import shlex + + +class Job(object): + + def __init__(self, cmd_line, output_file=None, work_dir=None): + self.cmd_line = cmd_line + self.output_file = output_file + self.work_dir = work_dir + self.output_fd = None + + def cmd(self): + return self.cmd_line + + def get_stdout(self): + return self.output_fd + + def get_stderr(self): + return self.output_fd + + def get_cwd(self): + return self.work_dir + + def open_output_file(self): + if self.output_file is not None: + self.output_fd = open(self.output_file, 'w') + + def close_output_file(self): + if self.output_fd is not None: + self.output_fd.close() + self.output_fd = None + + +def run_job(job, verbose=False): + args = shlex.split(' '.join(job.cmd())) + if verbose: + print(f'args = {args}') + job.open_output_file() + proc = subprocess.run(args=args, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) + job.close_output_file() + assert proc.returncode == 0, \ + f"This command failed: {job.cmd()}" diff --git a/deepspeed/nvme/parse_nvme_stats.py b/deepspeed/nvme/parse_nvme_stats.py new file mode 100755 index 0000000000000..09c79ada5b369 --- /dev/null +++ b/deepspeed/nvme/parse_nvme_stats.py @@ -0,0 +1,148 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import argparse + +READ_SPEED = 'read_speed' +WRITE_SPEED = 'write_speed' + +PERF_METRICS = [READ_SPEED, WRITE_SPEED] + +METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'} + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs') + + parser.add_argument('--metric', + type=str, + required=True, + help='Performance metric to report: [read_speed|write_speed]') + + args = parser.parse_args() + print(f'args = {args}') + + return args + + +def extract_value(key, file): + INVALID_PREFIXES = ["ds"] + for p in INVALID_PREFIXES: + if key.startswith(p): + return key + try: + if key[0] in ['t', 'd', 'p']: + return int(key[1:]) + if key.startswith("bs"): + if key.endswith('K'): + v = key[2:].split('K') + return int(v[0]) * 1024 + elif key.endswith('M'): + v = key[2:].split('M') + return int(v[0]) * 1024 * 1024 + else: + return int(key[2:]) + except: + print(f"{file}: extract_value fails on {key}") + return None + + return key + + +def get_file_key(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + values = [extract_value(k, file) for k in fields] + return tuple(values) + + +def get_thread_count(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + for key in fields: + if key[0] == 't': + return int(key[1:]) + return 1 + + +""" +Extract performance metric from log file. 
+Sample file lines are: +Task Read Latency = 0.031647682189941406 sec +Task Read Speed = 12.342926020792527 GB/sec +E2E Read Latency = 0.031697988510131836 sec +E2E Read Speed = 12.323337169333062 GB/sec + +For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned +""" + + +def get_metric(file, metric): + thread_count = get_thread_count(file) + with open(file) as f: + for line in f.readlines(): + if line.startswith(METRIC_SEARCH[metric]): + if metric in [READ_SPEED, WRITE_SPEED]: + fields = line.split() + return float(fields[-2]) + else: + fields = line.split('=') + return float(fields[-1]) + + return None + + +def validate_args(args): + if not args.metric in PERF_METRICS: + print(f'{args.metric} is not a valid performance metrics') + return False + + if not os.path.isdir(args.log_dir): + print(f'{args.log_dir} folder is not existent') + return False + + return True + + +def get_results(log_files, metric): + results = {} + for f in log_files: + file_key = get_file_key(f) + value = get_metric(f, metric) + results[file_key] = value + + return results + + +def get_sorted_results(log_dir, metric): + log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] + + log_files_path = [os.path.join(log_dir, f) for f in log_files] + results = get_results(log_files_path, metric) + result_keys = list(results.keys()) + sorted_keys = sorted(result_keys) + return sorted_keys, results + + +def main(): + print("Parsing aio statistics") + args = parse_arguments() + + if not validate_args(args): + quit() + + sorted_keys, results = get_sorted_results(args.log_dir, args.metric) + for k in sorted_keys: + print(f'{k} = {results[k]}') + + +if __name__ == "__main__": + main() diff --git a/deepspeed/nvme/perf_generate_param.py b/deepspeed/nvme/perf_generate_param.py new file mode 100644 index 0000000000000..d0313d728ad59 --- /dev/null +++ b/deepspeed/nvme/perf_generate_param.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +import os +import argparse +import json +from .parse_nvme_stats import READ_SPEED, WRITE_SPEED, get_sorted_results +from .perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Folder of performance sweep logs. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') + parser.add_argument('--verbose', action='store_true', help='Print debugging information.') + + args = parser.parse_args() + if args.verbose: + print(f'args = {args}') + + return args + + +def validate_args(args): + for d in [READ_LOG_DIR, WRITE_LOG_DIR]: + log_dir = os.path.join(args.log_dir, d) + if not os.path.isdir(log_dir): + print(f'{log_dir} folder is not existent') + return False + + return True + + +def convert_to_param(key): + assert len(key) == 6 + return { + "single_submit": "true" if key[0] == "single" else "false", + "overlap_events": "true" if key[1] == "overlap" else "false", + "num_threads": int(key[5]), + "queue_depth": int(key[3]), + "block_size": int(key[4]) + } + + +def generate_aio_param(read_log_dir, write_log_dir): + _, read_results = get_sorted_results(read_log_dir, READ_SPEED) + _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED) + combined_perf = {key[1:]: value for key, value in read_results.items()} + + for key, value in write_results.items(): + new_key = key[1:] + if new_key in combined_perf: + combined_perf[new_key] += value + else: + combined_perf[new_key] = 0 + + optimal_key = None + optimal_perf = 0.0 + for key, value in combined_perf.items(): + if value > optimal_perf: + optimal_perf = value + optimal_key = key + + aio_param = {"aio": convert_to_param(optimal_key)} + + read_perf_keys = {key[1:]: key for key in read_results.keys()} + write_perf_keys = {key[1:]: key for key in write_results.keys()} + optimal_config_read = read_results.get(read_perf_keys[optimal_key], None) + optimal_config_write = write_results.get(write_perf_keys[optimal_key], None) + + print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}') + print(json.dumps(aio_param, indent=3)) + + +def generate_main(log_dir): + read_log_dir = os.path.join(log_dir, READ_LOG_DIR) + write_log_dir = os.path.join(log_dir, WRITE_LOG_DIR) + generate_aio_param(read_log_dir, write_log_dir) + + +def main(): + args = parse_arguments() + if not validate_args(args): + quit() + print(f'Generate DeepNVMe configuration from {args.log_dir} logs') + generate_main(args.log_dir) + + +if __name__ == "__main__": + generate_main() diff --git a/deepspeed/nvme/perf_run_sweep.py b/deepspeed/nvme/perf_run_sweep.py new file mode 100644 index 0000000000000..0155a4d46caee --- /dev/null +++ b/deepspeed/nvme/perf_run_sweep.py @@ -0,0 +1,320 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
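Editor's note on perf_generate_param above: each sweep log name encodes the sweep point, and convert_to_param turns the best-performing key back into a DeepSpeed aio config block. A worked example with illustrative numbers, following the code as written:

    optimal_key = ('block', 'overlap', 'ftd1', 32, 1048576, 8)
    convert_to_param(optimal_key)
    # -> {'single_submit': 'false', 'overlap_events': 'true',
    #     'num_threads': 8, 'queue_depth': 32, 'block_size': 1048576}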
+""" +import os +import sys +import argparse +import json +import itertools +import shutil + +from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from .ds_aio_job import Job, run_job +from .perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ + READ_LOG_DIR, WRITE_LOG_DIR + +OTHER_OPTIONS = '--handle' +PERF_SCRIPT = 'ds_io' +DEFAULT_SWEEP_CONFIG = { + "block_size": ["1M", "8M"], + "queue_depth": [32, 128], + "sequential_requests": [False], + "single_submit": [False], + "io_parallel": [1, 8], +} + + +class SweepConfig(object): + + def __init__(self, args): + self.folder_to_device_mapping = get_ftd_map(args.nvme_dir) + self.search_space = get_sweep_config_dict(args.sweep_config) + self.search_space.update(self.folder_to_device_mapping) + self.read = not args.no_read + self.write = not args.no_write + self.flush_cache = args.flush_page_cache + self.log_dir = args.log_dir + self.verbose = args.verbose + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' + if args.gpu: + self.other_options += ' --gpu' + if args.gds: + self.other_options += ' --use_gds' + + +def validate_arguments(args): + if not async_io_setup(): + error_msg = """ + Failing because environment is not properly configured for deepspeed async i/o module. + Possible fix: apt install libaio-dev. + """ + print(error_msg) + quit() + + if args.gds and not gds_io_setup(): + error_msg = """ + Failing because environment is not properly configured for deepspeed GDS I/O operator. + """ + print(error_msg) + quit() + + +def parse_sweep_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--nvme_dir', + nargs='+', + required=True, + help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') + + parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') + + parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.') + + parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.') + + parser.add_argument('--io_size', + type=str, + default="400M", + help='Number of I/O bytes to read/write for performance measurements.') + + parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.') + + parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator') + + parser.add_argument( + '--flush_page_cache', + action='store_true', + help= + 'Page cache will not be flushed and reported read speeds may be higher than actual ***Requires sudo access***.' + ) + + parser.add_argument( + '--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Output directory for performance log files. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') + + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') + + parser.add_argument('--verbose', action='store_true', help='Print debugging information.') + + args = parser.parse_args() + if args.verbose: + print(f'args = {args}') + validate_arguments(args) + + return args + + +def dump_cmd_lines(cmd_lines): + print(f'cmd line count = {len(cmd_lines)}') + for i, cmd in enumerate(cmd_lines): + print(f'{i}: {cmd}') + + +def get_ftd_map(nvme_dir_list): + ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)] + ftd_arg = [' '.join(ftd for ftd in ftd_list)] + return {'folder_to_device_mapping': ftd_arg} + + +def get_sweep_config_dict(sweep_config_json): + if sweep_config_json is None: + return DEFAULT_SWEEP_CONFIG + + with open(sweep_config_json) as fp: + sweep_config = json.load(fp) + return sweep_config + + +def get_sweep_cmd_lines(sweep_config_dict): + + def flatten_options(key, value_list): + flat_list = [] + for v in value_list: + if not type(v) is bool: + flat_list.append(f'--{key} {v}') + elif v: + flat_list.append(f'--{key}') + else: + flat_list.append(' ') + + return flat_list + + flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()] + cmd_list = list(itertools.product(*flat_list)) + cmd_list = [list(cmd) for cmd in cmd_list] + #dump_cmd_lines(cmd_list) + return cmd_list + + +def launch_sweep(sweep_jobs, sync_job, flush_cache_job, verbose): + for perf_job in sweep_jobs: + if flush_cache_job is not None: + run_job(sync_job, verbose) + run_job(flush_cache_job, verbose) + + run_job(perf_job, verbose) + + run_job(sync_job, verbose) + + +def create_cmd_tags(cmd_line): + tags = {} + for param_value in cmd_line: + fields = param_value.split() + if len(fields) == 1: + tags[fields[0]] = None + elif len(fields) == 2: + if fields[0] == '--folder_to_device_mapping': + tags[fields[0]] = len(fields[1:]) + else: + tags[fields[0]] = fields[1] + elif len(fields) > 2: + tags[fields[0]] = len(fields[1:]) + return tags + + +def get_log_file(io_op_desc, cmd_line): + QUEUE_DEPTH = "--queue_depth" + BLOCK_SIZE = "--block_size" + SINGLE_SUBMIT = "--single_submit" + SEQUENTIAL_REQUESTS = "--sequential_requests" + FTD_MAP = "--folder_to_device_mapping" + IO_PARALLEL = "--io_parallel" + + tag_map = { + QUEUE_DEPTH: "d", + BLOCK_SIZE: "bs", + SINGLE_SUBMIT: "single", + SEQUENTIAL_REQUESTS: "sequential", + FTD_MAP: "ftd", + IO_PARALLEL: "p" + } + + tag_default = { + QUEUE_DEPTH: 1, + BLOCK_SIZE: "1M", + SINGLE_SUBMIT: "block", + SEQUENTIAL_REQUESTS: "overlap", + FTD_MAP: 1, + IO_PARALLEL: 1 + } + + def get_default_value(tag): + value = tag_default[tag] + if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]: + return value + return f'{tag_map[tag]}{value}' + + def get_config_value(tag, value): + tag_key = tag_map[tag] + if value is None: + return tag_key + return f'{tag_key}{value}' + + tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL] + log_tags = [io_op_desc] + cmd_tags = create_cmd_tags(cmd_line) + for tag in tag_list: + if tag in cmd_tags: + log_tags.append(get_config_value(tag, cmd_tags[tag])) + else: + log_tags.append(get_default_value(tag)) + + log_file = '_'.join(log_tags) + log_file += '.txt' + return log_file + + +def create_perf_jobs(io_op_desc, log_dir, cmd_lines): + py_cmd = [os.path.join(script_path(), PERF_SCRIPT)] + + perf_jobs = [] + for cmd in cmd_lines: + log_file = os.path.join(log_dir, get_log_file(io_op_desc, cmd)) + job = 
Job(cmd_line=py_cmd + cmd, output_file=log_file) + perf_jobs.append(job) + + return perf_jobs + + +def script_path(): + return os.path.dirname(os.path.realpath(sys.argv[0])) + + +def async_io_setup(): + return AsyncIOBuilder().is_compatible() + + +def gds_io_setup(): + return GDSBuilder().is_compatible() + + +def remove_folder(folder): + assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found" + shutil.rmtree(folder) + + +def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): + read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines] + # dump_cmd_lines(cmd_lines) + + log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') + os.makedirs(log_folder, exist_ok=True) + + perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines) + + launch_sweep(sweep_jobs=perf_jobs, + sync_job=sync_job, + flush_cache_job=flush_cache_job, + verbose=sweep_config.verbose) + + +def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): + write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines] + # dump_cmd_lines(write_cmd_lines) + + log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') + os.makedirs(log_folder, exist_ok=True) + + perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines) + + launch_sweep(sweep_jobs=perf_jobs, + sync_job=sync_job, + flush_cache_job=flush_cache_job, + verbose=sweep_config.verbose) + + +def sweep_main(args): + sweep_config = SweepConfig(args) + cmd_lines = get_sweep_cmd_lines(sweep_config.search_space) + + if sweep_config.flush_cache: + flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"]) + else: + flush_cache_job = None + + sync_job = Job(cmd_line=['sync']) + + if sweep_config.read: + run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines) + + if sweep_config.write: + run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines) + + +def main(): + args = parse_sweep_arguments() + print(f"Running DeepNVMe performance sweep on {args.nvme_dir}") + sweep_main(args) + + +if __name__ == "__main__": + main() diff --git a/deepspeed/nvme/perf_sweep_utils.py b/deepspeed/nvme/perf_sweep_utils.py new file mode 100644 index 0000000000000..e6832c1baa492 --- /dev/null +++ b/deepspeed/nvme/perf_sweep_utils.py @@ -0,0 +1,13 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +SCRIPT_PREFIX = '_aio_bench' +WRITE_OP_DESC = 'write' +READ_OP_DESC = 'read' +READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io' +WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io' +BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs' +READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs' +WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs' diff --git a/deepspeed/nvme/test_ds_aio.py b/deepspeed/nvme/test_ds_aio.py new file mode 100755 index 0000000000000..a17350414739c --- /dev/null +++ b/deepspeed/nvme/test_ds_aio.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import multiprocessing as mp +from .ds_aio_basic import aio_basic_multiprocessing +from .ds_aio_handle import aio_handle_multiprocessing +from .ds_aio_args import get_validated_args + + +def ds_io_main(): + print(f'Testing deepspeed_aio python frontend') + + args = get_validated_args() + mp.set_start_method('spawn') + multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing + multiprocess_function(args, args.read) + + +if __name__ == "__main__": + ds_io_main() diff --git a/deepspeed/nvme/test_ds_aio_utils.py b/deepspeed/nvme/test_ds_aio_utils.py new file mode 100755 index 0000000000000..cf167f6474603 --- /dev/null +++ b/deepspeed/nvme/test_ds_aio_utils.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +from .ds_aio_job import Job, run_job + +BYTES_PER_GB = 1024**3 +BYTES_PER_MB = 1024**2 +BYTES_PER_KB = 1024 +LOG_TIDS = [0] + + +def task_log(tid, msg, force=False): + if force or tid in LOG_TIDS: + print(f'tid {tid}: {msg}') + + +def task_barrier(barrier, num_parties): + assert barrier.parties == num_parties + barrier.wait() + assert barrier.broken == False + + +def report_results(args, read_op, pool_results): + #print(f'pool_results = {pool_results}') + io_string = 'Read' if read_op else 'Write' + if None in pool_results: + print(f'Failure in one of {args.threads} {io_string} processes') + return + + total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) + + task_latency_sec = max([sec for _, sec, _ in pool_results]) + task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB + print(f'Task {io_string} Latency = {task_latency_sec} sec') + print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') + + e2e_latency_sec = max([sec for sec, _, _ in pool_results]) + e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB + print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') + print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') + + +def get_block_size_and_count(io_bytes): + if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0: + block_size = BYTES_PER_MB + block_size_string = '1M' + else: + assert io_bytes % BYTES_PER_KB == 0 + block_size = BYTES_PER_KB + block_size_string = '1K' + block_count = io_bytes / block_size + + return block_size_string, int(block_count) + + +def refine_integer_value(value): + unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + + if value[-1] in list(unit_dict.keys()): + int_value = int(value[:-1]) * unit_dict[value[-1]] + return int_value + return int(value) + + +def create_filename(folder, read_op, size, tid): + io_string = "read" if read_op else "write" + return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}') + + +def create_file(filename, num_bytes): + block_size, block_count = get_block_size_and_count(num_bytes) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}']) + print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....') + run_job(dd_job) + print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....') diff --git a/deepspeed/nvme/validate_async_io.py b/deepspeed/nvme/validate_async_io.py new file mode 100644 index 0000000000000..10fb638347bcd --- /dev/null +++ b/deepspeed/nvme/validate_async_io.py @@ -0,0 +1,10 @@ +# Copyright (c) Microsoft 
Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +from deepspeed.ops.op_builder import AsyncIOBuilder +assert AsyncIOBuilder().is_compatible() +assert AsyncIOBuilder().load() diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index 9709328cc1335..c0dd29f4f9627 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -103,7 +103,6 @@ def __init__(self, self.return_tuple = return_tuple self.mlp_after_attn = mlp_after_attn self.mlp_act_func_type = mlp_act_func_type - self.specialized_mode = False self.training_mp_size = training_mp_size self.bigscience_bloom = bigscience_bloom self.max_out_tokens = max_out_tokens diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py index 5efc560db75e4..3c2340ccfc6f9 100644 --- a/deepspeed/ops/transformer/inference/diffusers_attention.py +++ b/deepspeed/ops/transformer/inference/diffusers_attention.py @@ -10,10 +10,11 @@ from packaging import version as pkg_version from deepspeed.utils.logging import log_dist from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp +from deepspeed.ops.transformer.inference.op_binding.softmax_context import SoftmaxContextOp +from deepspeed.ops.transformer.inference.op_binding import LinearOp +from deepspeed.ops.transformer.inference.op_binding.pad_transform import PadTransformOp -# Cuda modules will be imported if needed -inference_module = None minus_inf = -10000.0 triton_flash_attn = None @@ -36,7 +37,8 @@ class DeepSpeedDiffusersAttentionFunction(Function): @staticmethod def forward(ctx, input, context, input_mask, config, attn_qkvw, attn_qw, attn_kw, attn_vw, attn_qkvb, num_attention_heads_per_partition, norm_factor, hidden_size_per_partition, attn_ow, attn_ob, - do_out_bias, score_context_func, linear_func, triton_flash_attn_kernel, rope_theta): + do_out_bias, score_context_func, linear_func, pad_transform_func, triton_flash_attn_kernel, + rope_theta): def _transpose_for_context(x): x = x.permute(0, 2, 1, 3) @@ -77,7 +79,7 @@ def selfAttention_fp(input, context, input_mask): query = query.contiguous() key = key.contiguous() value = value.contiguous() - query, key, value = inference_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn) + query, key, value = pad_transform_func(query, key, value, config.heads, do_flash_attn) attention_scores = (torch.matmul(query, key.transpose(-1, -2)) * scale).softmax(dim=-1) context_layer = _transpose_for_context(torch.matmul(attention_scores, value)) @@ -117,10 +119,6 @@ def __init__( data_type = self.config.dtype data_type_fp = torch.half if self.config.dtype == torch.int8 else self.config.dtype - global inference_module - if inference_module is None: - builder = InferenceBuilder() - inference_module = builder.load() if DeepSpeedDiffusersAttention.layer_id == 1: log_dist(f"DeepSpeed-Attention config: {self.config.__dict__}", [0]) @@ -171,26 +169,24 @@ def __init__( self.norm_factor *= math.sqrt(self.config.layer_id + 1) # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191 - if self.config.dtype in [torch.float16, torch.int8]: - self.score_context_func = 
inference_module.softmax_context_fp16 - self.linear_func = inference_module.linear_layer_fp16 - self.allocate_workspace = inference_module.allocate_workspace_fp16 - else: - self.score_context_func = inference_module.softmax_context_fp32 - self.linear_func = inference_module.linear_layer_fp32 - self.allocate_workspace = inference_module.allocate_workspace_fp32 + self.workspace = WorkspaceOp(self.config) + self.score_context_func = SoftmaxContextOp(self.config) + self.linear_func = LinearOp(self.config) + self.pad_transform_func = PadTransformOp(self.config) - def forward(self, input, context=None, input_mask=None): + def allocate_workspace(self, size): + # Allocate memory only on first layer forward if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, - 0, self.config.max_out_tokens, self.config.min_out_tokens) - output = DeepSpeedDiffusersAttentionFunction.apply(input, context, input_mask, self.config, self.attn_qkvw, - self.attn_qw, self.attn_kw, self.attn_vw, self.attn_qkvb, - self.num_attention_heads_per_partition, self.norm_factor, - self.hidden_size_per_partition, self.attn_ow, self.attn_ob, - self.do_out_bias, self.score_context_func, self.linear_func, - self.triton_flash_attn_kernel, self.config.rope_theta) + self.workspace.allocate_workspace(self.config.hidden_size, self.config.heads, size[1], size[0], + DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, 0, + self.config.max_out_tokens, self.config.min_out_tokens) + + def forward(self, input, context=None, input_mask=None): + self.allocate_workspace(input.size()) + output = DeepSpeedDiffusersAttentionFunction.apply( + input, context, input_mask, self.config, self.attn_qkvw, self.attn_qw, self.attn_kw, self.attn_vw, + self.attn_qkvb, self.num_attention_heads_per_partition, self.norm_factor, self.hidden_size_per_partition, + self.attn_ow, self.attn_ob, self.do_out_bias, self.score_context_func, self.linear_func, + self.pad_transform_func, self.triton_flash_attn_kernel, self.config.rope_theta) return output diff --git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py index b0156f905a06e..d01638f36e401 100644 --- a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py +++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py @@ -10,26 +10,9 @@ from .diffusers_attention import DeepSpeedDiffusersAttention from .bias_add import nhwc_bias_add from .diffusers_2d_transformer import Diffusers2DTransformerConfig -from deepspeed.ops.op_builder import InferenceBuilder, SpatialInferenceBuilder from deepspeed.utils.types import ActivationFuncType - -# Ops will be loaded on demand -transformer_cuda_module = None -spatial_cuda_module = None - - -def load_transformer_module(): - global transformer_cuda_module - if transformer_cuda_module is None: - transformer_cuda_module = InferenceBuilder().load() - return transformer_cuda_module - - -def load_spatial_module(): - global spatial_cuda_module - if spatial_cuda_module is None: - spatial_cuda_module = SpatialInferenceBuilder().load() - return spatial_cuda_module +from .op_binding.gated_activation import GatedActivationOp +from .op_binding.layer_norm import LayerNormOp class DeepSpeedDiffusersTransformerBlock(nn.Module): @@ -76,8 +59,8 @@ def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerC else: 
self.attn_2_bias = nn.Parameter(torch.zeros_like(self.norm3_g), requires_grad=False) - self.transformer_cuda_module = load_transformer_module() - load_spatial_module() + self.gated_activation = GatedActivationOp() + self.layer_norm = LayerNormOp() def forward(self, hidden_states, context=None, timestep=None, **kwargs): # In v0.12.0 of diffuser, several new kwargs were added. Capturing @@ -88,17 +71,17 @@ def forward(self, hidden_states, context=None, timestep=None, **kwargs): if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] is not None: context = kwargs["encoder_hidden_states"] - out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) + out_norm_1 = self.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) out_attn_1 = self.attn_1(out_norm_1) - out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_norm_2, out_attn_1 = self.layer_norm.layer_norm_residual_store_pre_ln_res( out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps) out_attn_2 = self.attn_2(out_norm_2, context=context) - out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_norm_3, out_attn_2 = self.layer_norm.layer_norm_residual_store_pre_ln_res( out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps) out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w) - out_geglu = self.transformer_cuda_module.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU) + out_geglu = self.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU) out_ff2 = nn.functional.linear(out_geglu, self.ff2_w) return nhwc_bias_add(out_ff2, self.ff2_b, other=out_attn_2) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index ffb58175daadd..24f710d224949 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -89,7 +89,7 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count torch.empty(self.hidden_size_per_partition * 3, dtype=data_type_fp, device=device) ] - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] @@ -108,7 +108,10 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=DeepSpeedSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -136,7 +139,8 @@ def forward(self, output_attentions=False, norm_w=None, norm_b=None, - alibi=None): + alibi=None, + **kwargs): if self.attn_qkvw is None: self._attn_qkvw, self._attn_qkvb = self._merge_qkv() else: @@ -157,10 +161,17 @@ def forward(self, gamma=norm_w, beta=norm_b) + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) + context_layer, key_layer, value_layer = self.compute_attention(qkv_out=qkv_out, input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + 
token_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] @@ -210,7 +221,7 @@ def _split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_ return tensor_list - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index fc001a86d42e5..3a9785985d196 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -7,16 +7,16 @@ import math import torch from torch.autograd import Function -# accelerator modules will be imported if needed -inference_module = None -specialized_mode = None import torch.nn as nn from .ds_attention import DeepSpeedSelfAttention from .config import DeepSpeedInferenceConfig +from .op_binding import SoftmaxOp, VectorMatMulOp, GELUGemmOp +from .op_binding.bias_residual import BiasResidualOp +from .op_binding.einsum_sec_sm_ecm import EinsumSecSmEcmOp +from .op_binding.layer_norm import LayerNormOp from ....moe.sharded_moe import TopKGate from deepspeed import comm as dist -from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder +from .op_binding.moe_res_matmul import MoEResMatmulOp class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): @@ -110,16 +110,13 @@ class DeepSpeedMLPFunction(Function): @staticmethod def forward(ctx, input, inter_w, inter_b, config, output_b, output_w, q_scales, q_groups, merge_count, mp_group, - async_op): + async_op, gelu_gemm_func, vector_matmul_func): if config.q_int8: - intermediate = inference_module.fused_gemm_gelu_int8(input, inter_w, inter_b, config.epsilon, q_scales[2], - (q_groups * (2**merge_count)), config.pre_layer_norm) - output = inference_module.vector_matmul_int8(intermediate, output_w, q_scales[3], q_groups, (merge_count)) + intermediate = gelu_gemm_func(input, inter_w, inter_b, config.epsilon, q_scales[2], + (q_groups * (2**merge_count)), config.pre_layer_norm) + output = vector_matmul_func(intermediate, output_w, q_scales[3], q_groups, (merge_count)) else: - mlp_gemm_func = inference_module.fused_gemm_gelu_fp16 if config.fp16 else \ - inference_module.fused_gemm_gelu_fp32 - - output = mlp_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) + output = gelu_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) if mp_group is not None and dist.get_world_size(group=mp_group) > 1: dist.all_reduce(output, group=mp_group, async_op=async_op) @@ -150,10 +147,13 @@ def __init__(self, config, q_scales=None, q_groups=1, merge_count=1, mlp_extra_g self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups self.merge_count = int(math.log2(merge_count)) self.mp_group = mp_group + self.gelu_gemm_func = GELUGemmOp(self.config) + self.vector_matmul_func = VectorMatMulOp(self.config) def forward(self, input, async_op=False): return DeepSpeedMLPFunction.apply(input, self.inter_w, self.inter_b, self.config, self.output_b, self.output_w, - self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op) + self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op, + self.gelu_gemm_func, 
self.vector_matmul_func) class DeepSpeedMoEInference(nn.Module): @@ -187,18 +187,7 @@ def __init__(self, self.config = config self.config.layer_id = DeepSpeedMoEInference.layer_id - global inference_module - global specialized_mode - if inference_module is None: - specialized_mode = False - # InferenceSpecializedBuilder is not among DeepSpeed provided builder yet, so we infer by builder name string - builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder") - if builder is not None and builder.is_compatible(): - inference_module = builder.load() - specialized_mode = True - else: - inference_module = InferenceBuilder().load() - self.config.specialized_mode = specialized_mode + assert self.config.dtype != torch.bfloat16, "DeepSpeed MoE Transformer Inference not yet tested for bfloat support" DeepSpeedMoEInference.layer_id += 1 @@ -213,10 +202,8 @@ def __init__(self, self.res_mlp = DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, mp_group) self.res_coef = nn.Parameter(torch.Tensor(self.config.hidden_size, 2)) - self.coef_func = inference_module.softmax_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.softmax_fp32 - self.vector_matmul_func = inference_module.vector_matmul_fp16 if self.config.dtype == torch.float16 else \ - inference_module.vector_matmul_fp32 + self.coef_func = SoftmaxOp(self.config) + self.vector_matmul_func = VectorMatMulOp(self.config) config.mp_size = 1 self.mlp = nn.ModuleList( @@ -234,12 +221,10 @@ def __init__(self, print("DeepSpeed MoE Transformer Inference config is ", self.config.__dict__) - self.bias_residual_func = inference_module.bias_residual_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.bias_residual_fp32 - self.ds_layernorm = inference_module.layer_norm_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.layer_norm_fp32 - self.einsum_sec_sm_ecm = inference_module.einsum_sec_sm_ecm_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.einsum_sec_sm_ecm_fp32 + self.bias_residual_func = BiasResidualOp(self.config) + self.ds_layernorm = LayerNormOp(self.config) + self.einsum_sec_sm_ecm = EinsumSecSmEcmOp(self.config) + self.moe_res_matmul = MoEResMatmulOp(self.config) def res_coef_func(self, inp, async_op): inp = self.vector_matmul_func(inp, self.res_coef, async_op) @@ -346,7 +331,7 @@ def forward(self, dim=0)[dist.get_rank(group=self.expert_mp_group)] if self.config.mlp_type == 'residual': - inference_module.moe_res_matmul(res_mlp_out, res_coef_out, output) + self.moe_res_matmul(res_mlp_out, res_coef_out, output) output = self.bias_residual_func(output, residual_add, torch.empty(1)) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_add.py b/deepspeed/ops/transformer/inference/op_binding/bias_add.py new file mode 100644 index 0000000000000..d2ae38f546eb4 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_add.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasAddOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasAddOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_add_func = self.inference_module.bias_add_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_add_func = self.inference_module.bias_add_bf16 + else: + self.bias_add_func = self.inference_module.bias_add_fp32 + except AttributeError: + self.bias_add_func = self.bias_add_fallback + + @classmethod + def bias_add_fallback(cls, input, bias): + return torch.add(input, bias) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_add_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py b/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py new file mode 100644 index 0000000000000..f0fee0b0d06ea --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasGeluOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasGeluOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_gelu_func = self.inference_module.bias_gelu_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_gelu_func = self.inference_module.bias_gelu_bf16 + else: + self.bias_gelu_func = self.inference_module.bias_gelu_fp32 + except AttributeError: + self.bias_gelu_func = self.bias_gelu_fallback + + @classmethod + def bias_gelu_fallback(cls, activations, bias): + # Expected behavior is that of casting to float32 internally and using the tanh approximation + return F.gelu(activations.to(torch.float32) + bias.to(torch.float32), approximate='tanh').to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_gelu_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_relu.py b/deepspeed/ops/transformer/inference/op_binding/bias_relu.py new file mode 100644 index 0000000000000..ccfade1d95245 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_relu.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasReluOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasReluOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_relu_func = self.inference_module.bias_relu_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_relu_func = self.inference_module.bias_relu_bf16 + else: + self.bias_relu_func = self.inference_module.bias_relu_fp32 + except AttributeError: + self.bias_relu_func = self.bias_relu_fallback + + @classmethod + def bias_relu_fallback(cls, activations, bias): + # Expected behavior is that of casting to float32 internally + return F.relu(activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_relu_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_residual.py b/deepspeed/ops/transformer/inference/op_binding/bias_residual.py new file mode 100644 index 0000000000000..ecad50e10ffef --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_residual.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasResidualOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasResidualOp, self).__init__(config) + + try: + if self.config.dtype in [torch.float16, torch.int8]: + self.bias_residual_func = self.inference_module.bias_residual_fp16 + else: + self.bias_residual_func = self.inference_module.bias_residual_fp32 + except AttributeError: + self.bias_residual_func = self.bias_residual_fallback + + @classmethod + def bias_residual_fallback(cls, output, residual, bias): + raise NotImplementedError("bias residual fallback isn't implemented") + + def forward(self, output, residual, bias): + return self.bias_residual_func(output, residual, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py b/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py new file mode 100644 index 0000000000000..f34b10f786d11 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class EinsumSecSmEcmOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(EinsumSecSmEcmOp, self).__init__(config) + + try: + if self.config.dtype in [torch.float16, torch.int8]: + self.einsum_sec_sm_ecm_func = self.inference_module.einsum_sec_sm_ecm_fp16 + else: + self.einsum_sec_sm_ecm_func = self.inference_module.einsum_sec_sm_ecm_fp32 + except AttributeError: + self.einsum_sec_sm_ecm_func = self.einsum_sec_sm_ecm_fallback + + @classmethod + def einsum_sec_sm_ecm_fallback(cls, Q, W): + raise NotImplementedError("einsum sec sm ecm fallback isn't implemented") + + def forward(self, Q, W): + return self.einsum_sec_sm_ecm_func(Q, W) diff --git a/deepspeed/ops/transformer/inference/op_binding/gated_activation.py b/deepspeed/ops/transformer/inference/op_binding/gated_activation.py new file mode 100644 index 0000000000000..d28d818ce4b33 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/gated_activation.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from deepspeed.utils.types import ActivationFuncType +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class GatedActivationOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(GatedActivationOp, self).__init__(config) + try: + self.gated_activation_func = self.inference_module.gated_activation + except AttributeError: + self.gated_activation_func = self.gated_activation_fallback + + @classmethod + def gated_activation_fallback(cls, activation, bias, activation_func_type): + # Expected behavior is that of casting to float32 internally + # Explicitly using the default GeLU + activation_func = None + activations = activation + bias.reshape(1, 1, -1) + hidden_states, gate = activations.chunk(2, dim=-1) + + if activation_func_type == ActivationFuncType.GATED_SILU: + activation_func = F.silu + elif activation_func_type == ActivationFuncType.GATED_GELU: + activation_func = F.gelu + + return hidden_states * activation_func(gate.to(torch.float32)).to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor, activation_func_type: ActivationFuncType): + return self.gated_activation_func(activation, bias, activation_func_type) diff --git a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py index 63323c150752b..60bbb4b48bdb3 100644 --- a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp import deepspeed @@ -14,7 +15,9 @@ class GELUGemmOp(BaseOp): def __init__(self, config: DeepSpeedInferenceConfig): super(GELUGemmOp, self).__init__(config) try: - if self.config.dtype in [torch.float16, torch.int8]: + if self.config.dtype == torch.int8: + self.fused_gemm_gelu = self.inference_module.fused_gemm_gelu_int8 + elif self.config.dtype == torch.float16: if deepspeed.HAS_TRITON and self.config.use_triton and self.config.dtype == torch.float16: from deepspeed.ops.transformer.inference.triton.ops import fused_gemm_gelu as 
_triton_fused_gemm_gelu self.fused_gemm_gelu = _triton_fused_gemm_gelu # type: ignore @@ -28,7 +31,11 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.fused_gemm_gelu = self.gelu_gemm_fallback def gelu_gemm_fallback(self, input, weight, scale, bias, out, out_scale, dtype, transpose): - raise NotImplementedError + tmp = torch.matmul(input, weight) + tmp = F.gelu(tmp.to(torch.float32) + bias.to(torch.float32), approximate="tanh").to(tmp.dtype) + output = torch.matmul(tmp, out) + + return output def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, weight_out: torch.Tensor): diff --git a/deepspeed/ops/transformer/inference/op_binding/layer_norm.py b/deepspeed/ops/transformer/inference/op_binding/layer_norm.py new file mode 100644 index 0000000000000..31219a58ac3c8 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/layer_norm.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class LayerNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + super(LayerNormOp, self).__init__(config) + try: + if config is None: + self.layer_norm_func = self.inference_module.layer_norm + elif self.config.dtype in [torch.float16, torch.int8]: + self.layer_norm_func = self.inference_module.layer_norm_fp16 + else: + self.layer_norm_func = self.inference_module.layer_norm_fp32 + except AttributeError: + self.layer_norm_func = self.layer_norm_fallback + + @classmethod + def layer_norm_residual(cls, vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f + bias_f + res_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + @classmethod + def layer_norm_residual_store_pre_ln_res(cls, vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + res_output = vals_f + bias_f + res_f + norm_output = F.layer_norm(res_output, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + return norm_output, res_output.to(dtype) + + @classmethod + def layer_norm_fallback(cls, vals, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + def forward(self, vals, gamma, beta, epsilon): + return self.layer_norm_func(vals, gamma, beta, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py index 3064c00d1755d..5f1f915ec0216 100644 --- a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -5,12 +5,12 @@ from typing import Optional -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp from deepspeed.utils.types import NormType 
+from .pre_rms_norm import PreRMSNormOp class MLPGemmOp(BaseOp): @@ -39,23 +39,45 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.mlp_gemm_func = self.mlp_gemm_fallback elif self.config.norm_type == NormType.RMSNorm: self.mlp_gemm_func = self.rms_mlp_gemm_fallback + self.pre_rms_norm = PreRMSNormOp() def mlp_gemm_fallback(self, input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and mlp_after_attn and not transpose: - residual_add = F.layer_norm(input + residual + input_bias, (input.shape[2], ), gamma, beta, - self.config.epsilon) - tmp = torch.matmul(residual_add, weight_interm) + if mlp_after_attn: + residual_add = F.layer_norm(input + residual + input_bias, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(residual_add, weight_interm.t() if transpose else weight_interm) tmp = F.gelu(tmp + bias) - output = torch.matmul(tmp, weight_out) - return (output, residual_add) + output = torch.matmul(tmp, weight_out.t() if transpose else weight_out) + + return output, residual_add else: raise NotImplementedError def rms_mlp_gemm_fallback(self, input, residual, weight_interm, weight_out, gamma, eps, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): - raise NotImplementedError + inp_norm, residual = self.pre_rms_norm(input, residual, gamma, eps) + tmp = torch.matmul(inp_norm.view([-1, inp_norm.size(2)]), weight_interm.t() if transpose else weight_interm) + up_proj, gate_proj = tmp.chunk(2, dim=1) + + from deepspeed.utils.types import ActivationFuncType + if mlp_act_func_type == ActivationFuncType.GELU: + intermediate = F.gelu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.ReLU: + intermediate = F.relu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.GATED_GELU: + intermediate = F.gelu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.GATED_SILU: + intermediate = F.silu(gate_proj) + else: + raise NotImplementedError(f"rms_mlp_gemm_fallback not implemented for activation type {mlp_act_func_type}") + + intermediate = intermediate * up_proj + + output = torch.matmul(intermediate, weight_out.t() if transpose else weight_out) + output = output.view([input.size(0), input.size(1), -1]) + + return [output, residual] def forward(self, input: torch.Tensor, diff --git a/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py b/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py new file mode 100644 index 0000000000000..ef3558c8bc889 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class MoEResMatmulOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(MoEResMatmulOp, self).__init__(config) + try: + self.moe_res_matmul_func = self.inference_module.moe_res_matmul + except AttributeError: + self.moe_res_matmul_func = self.moe_res_matmul_fallback + + @classmethod + def moe_res_matmul_fallback(cls, residual, coef, output): + coef_t = coef.transpose(1, 2).contiguous() + coef1, coef2 = torch.split(coef_t, split_size_or_sections=coef_t.shape[len(coef_t.shape) - 1] // 2, dim=-1) + return residual * coef1 + output * coef2 + + def forward(self, residual, coef, output): + return self.moe_res_matmul_func(residual, coef, output) diff --git a/deepspeed/ops/transformer/inference/op_binding/pad_transform.py b/deepspeed/ops/transformer/inference/op_binding/pad_transform.py new file mode 100644 index 0000000000000..876fefc3bcfbd --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/pad_transform.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class PadTransformOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(PadTransformOp, self).__init__(config) + try: + self.pad_transform_func = self.inference_module.pad_transform_fp16 + except AttributeError: + self.pad_transform_func = self.pad_transform_fallback + + @staticmethod + def pad_transform_fallback(query, key, value, heads, do_flash_attn): + raise NotImplementedError("pad_transform fallback is not implemented.") + + def forward(self, query, key, value, heads, do_flash_attn): + return self.pad_transform_func(query, key, value, heads, do_flash_attn) diff --git a/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py b/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py new file mode 100644 index 0000000000000..7969d20f0527b --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp +from .rms_norm import RMSNormOp + + +class PreRMSNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(PreRMSNormOp, self).__init__(config) + try: + self.pre_rms_norm_func = self.inference_module.pre_rms_norm + except AttributeError: + self.pre_rms_norm_func = self.pre_rms_norm_fallback + + @staticmethod + def pre_rms_norm_fallback(vals, residual, gamma, epsilon): + residual = vals.to(torch.float32) + residual.to(torch.float32) + vals = residual + + return RMSNormOp.rms_norm_fallback(vals, gamma, epsilon), residual.to(gamma.dtype) + + def forward(self, vals, residual, gamma, epsilon): + return self.pre_rms_norm_func(vals, residual, gamma, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py index 250bf9864e1e7..9ff5366fae5d6 100644 --- a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -3,11 +3,11 @@ # DeepSpeed Team -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from .rms_norm import RMSNormOp import deepspeed from deepspeed.utils.types import NormType @@ -56,19 +56,23 @@ def _triton_autotune(min_seqlen, max_seqlen, hidden_size, dtype=torch.float16): matmul(A, B) Fp16Matmul._update_autotune_table() - def qkv_gemm_fallback(self, input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and not transpose: - inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) - tmp = torch.matmul(inp_norm, weight) - if add_bias: - tmp += bias - output = [tmp, inp_norm] - return output - else: - raise NotImplementedError + @staticmethod + def qkv_gemm_fallback(input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + if add_bias: + tmp += bias + output = [tmp, inp_norm] + + return output + + @staticmethod + def rms_qkv_gemm_fallback(input, weight, q_scale, gamma, eps, q_int8, transpose): + inp_norm = RMSNormOp.rms_norm_fallback(input, gamma, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + output = [tmp, inp_norm] - def rms_qkv_gemm_fallback(self, input, weight, q_scale, gamma, eps, q_int8, transpose): - raise NotImplementedError + return output def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor): diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py index 6f9b35cbc05d7..93b229c5d1ac0 100644 --- a/deepspeed/ops/transformer/inference/op_binding/residual_add.py +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -3,9 +3,10 @@ # DeepSpeed Team -import os import torch from typing import Optional + +from .vector_add import VectorAddOp from ..config import DeepSpeedInferenceConfig from .base import BaseOp @@ -22,11 +23,32 @@ def __init__(self, config: DeepSpeedInferenceConfig): else: self.residual_add_func = self.inference_module.residual_add_bias_fp32 except AttributeError: - 
self.residual_add_func = None - try: - self._vector_add = self.inference_module._vector_add - except AttributeError: - self._vector_add = None + self.residual_add_func = self.residual_add_fallback + self.vector_add = VectorAddOp() + + @staticmethod + def res_add_bias(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size): + hidden_state += attn_output + (residual + final_bias) / mp_size + if add_attn_bias: + hidden_state += attn_bias / mp_size + + return hidden_state + + @staticmethod + def residual_add_fallback(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + if mlp_after_attn: + if pre_layer_norm: + tmp = (residual.float() + attention_output.float() + attention_bias.float() + + final_bias.float()) / mp_size + hidden_state.float() + else: + tmp = residual.float() + hidden_state.float() + final_bias.float() + else: + tmp = ResidualAddOp.res_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, + add_bias, mp_size) + residual.copy_(tmp.to(hidden_state.dtype)) + + return residual def forward(self, hidden_state: torch.Tensor, @@ -37,28 +59,15 @@ def forward(self, attention_bias: Optional[torch.Tensor] = None, final_bias: Optional[torch.Tensor] = None): - if self.residual_add_func is not None: - if final_bias is None: - residual = self._vector_add(residual, hidden_state, 1.0 / self.config.mp_size) - else: - if not self.config.pre_layer_norm and residual_add is not None: - # only use residual add if its set and we are not pre layer norm - residual = residual_add - - self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, - self.config.mp_size, self.config.mlp_after_attn, add_bias, - self.config.pre_layer_norm) + if final_bias is None and attention_bias is None: + residual = self.vector_add(residual + attention_output, hidden_state, 1.0 / self.config.mp_size) else: - # fallback - if os.environ.get('DS_KI_FALLBACK') == 'True' and self.config.mlp_after_attn: - if self.config.pre_layer_norm: - tmp = (residual.float() + attention_output.float() + attention_bias.float() + - final_bias.float()) / self.config.mp_size + hidden_state.float() - else: - tmp = residual.float() + hidden_state.float() + final_bias.float() + if not self.config.pre_layer_norm and residual_add is not None: + # only use residual add if its set and we are not pre layer norm + residual = residual_add + + self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, + self.config.mp_size, self.config.mlp_after_attn, add_bias, + self.config.pre_layer_norm) - input_dtype = hidden_state.dtype - residual = tmp.to(input_dtype) - else: - raise NotImplementedError return residual diff --git a/deepspeed/ops/transformer/inference/op_binding/rms_norm.py b/deepspeed/ops/transformer/inference/op_binding/rms_norm.py new file mode 100644 index 0000000000000..128883ce5d438 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/rms_norm.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class RMSNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(RMSNormOp, self).__init__(config) + try: + self.rms_norm_func = self.inference_module.rms_norm + except AttributeError: + self.rms_norm_func = self.rms_norm_fallback + + @staticmethod + def rms_norm_fallback(vals, gamma, epsilon): + variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) + vals = vals * torch.rsqrt(variance + epsilon) + + if gamma.dtype in [torch.float16, torch.bfloat16]: + vals = vals.to(gamma.dtype) + + return gamma * vals + + def forward(self, vals, gamma, epsilon): + return self.rms_norm_func(vals, gamma, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py index bc309d94df147..2e08541596fa8 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -3,11 +3,11 @@ # DeepSpeed Team -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from deepspeed.ops.transformer.inference.op_binding.workspace import InferenceContext class SoftmaxOp(BaseOp): @@ -25,24 +25,42 @@ def __init__(self, config: DeepSpeedInferenceConfig): except AttributeError: self.softmax_func = self.softmax_fallback - def softmax_fallback(self, attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, - async_op, layer_scale, head_offset, mp_size): - if os.environ.get('DS_KI_FALLBACK') == 'True': - alibi = alibi[head_offset:head_offset + self.num_attention_heads_per_partition] - input_dtype = attn_scores.dtype - if (triangular): - tri = ~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) - attn_scores = torch.masked_fill(attn_scores * layer_scale, tri, torch.finfo(input_dtype).min) - if alibi is not None: - attn_scores += alibi - if attn_mask is not None: - # expand atten_mask from two dim into 4 dim, insert two dims in the middle + @staticmethod + def softmax_fallback(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, async_op, + layer_scale, head_offset, mp_size): + scores_len = len(attn_scores.size()) + heads = 1 + if scores_len > 1: + heads = attn_scores.size()[1] + num_attention_heads_per_partition = heads // mp_size + + if alibi is not None: + if len(alibi.shape) == 1: + alibi = None + else: + alibi = alibi[head_offset:head_offset + num_attention_heads_per_partition] + if attn_mask is not None and len(attn_mask.shape) == 1: + attn_mask = None + input_dtype = attn_scores.dtype + attn_scores *= layer_scale + + if alibi is not None: + attn_scores += alibi + if attn_mask is not None: + # expand atten_mask from two dim into 4 dim, insert two dims in the middle + if len(attn_mask.shape) == 2: attn_mask = attn_mask[:, None, None, :] - attn_scores += attn_mask - output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) - return output - else: - raise NotImplementedError + attn_scores += attn_mask + if triangular: + if attn_scores.shape[2] == 1: # query using kv cache + token_idx = InferenceContext.Instance().current_tokens() + tri = torch.arange(attn_scores.shape[2], device=attn_scores.device).ge(token_idx) + else: + tri = 
~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) + attn_scores = torch.masked_fill(attn_scores, tri, float('-inf')) + output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) + + return output def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool, recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float, diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 0dc4e08a36335..d745df678e93b 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -7,6 +7,8 @@ from deepspeed import comm as dist from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from .softmax import SoftmaxOp +from deepspeed.ops.transformer.inference.op_binding.workspace import InferenceContext class SoftmaxContextOp(BaseOp): @@ -23,13 +25,108 @@ def __init__(self, config: DeepSpeedInferenceConfig): except AttributeError: self.softmax_context_func = self.softmax_context_fallback + @staticmethod + def transform4d_0213(x, seq_length): + assert x.dim() == 3, F"Dim {x.dim()} is not supported" + batch_size, num_heads, seq_length_head_dim = x.shape + head_dim = seq_length_head_dim // seq_length + x = x.view(batch_size, num_heads, seq_length, head_dim) + x = x.permute(0, 2, 1, 3) + + return x + + @staticmethod + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep <= 1 or num_key_value_heads == 1: + return hidden_states + + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + @staticmethod + def bias_add_transform_0213(input, bias, num_heads, trans_count, perform_bias=False): + assert trans_count == 1 or trans_count == 3, F"Trans count {trans_count} is not supported" + assert input.dim() == 3, F"Dim {input.dim()} is not supported" + input_biased = torch.add(input, bias) if perform_bias else input + batch_size, seq_length, value_size = input_biased.shape + hid_dim = value_size // trans_count + head_dim = hid_dim // num_heads + + if trans_count == 1: + query_layer = input.view(batch_size, seq_length, num_heads, head_dim) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = torch.zeros_like(query_layer) + value_layer = torch.zeros_like(query_layer) + return query_layer, key_layer, value_layer + + qkv_layers = input.view(batch_size, seq_length, 3, num_heads, head_dim) + query_layer, key_layer, value_layer = qkv_layers[..., 0, :, :], qkv_layers[..., 1, :, :], qkv_layers[..., + 2, :, :] + query_layer = query_layer.transpose(1, 2) + key_layer = key_layer.transpose(1, 2) + value_layer = value_layer.transpose(1, 2) + + return query_layer, key_layer, value_layer + def softmax_context_fallback(self, query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads, num_kv, norm_factor, triangular_masking, local_attention, window_size, no_masking, - layer_id, num_layers, alibi, rope_theta): - raise NotImplementedError + layer_id, num_layers, alibi, rope_theta, is_prompt, token_idx, position_ids): + bat_0213_query, bat_0213_key, bat_0213_value = self.bias_add_transform_0213( + query_key_value, None, heads, 3, False) + + if rotary_dim > 0 and 
rotate_half: + from transformers.models.llama.modeling_llama import apply_rotary_pos_emb + + rotary = InferenceContext.Instance().get_rotary(rotary_dim, rope_theta, bat_0213_value.device) + cos, sin = rotary(bat_0213_value, InferenceContext.Instance().get_max_tokens_num()) + bat_0213_query, bat_0213_key = apply_rotary_pos_emb(bat_0213_query, bat_0213_key, cos, sin, position_ids) + + bat_0213_key, bat_0213_value = InferenceContext.Instance().update_cache(layer_id, token_idx, is_prompt, + bat_0213_key, bat_0213_value) + + bat_0213_key = self.repeat_kv(bat_0213_key, num_kv) + bat_0213_value = self.repeat_kv(bat_0213_value, num_kv) + + bsz = query_key_value.shape[0] + head_dim = query_key_value.shape[2] // (heads * 3) + + bmm_output = torch.bmm(bat_0213_query.reshape(bsz * heads, bat_0213_query.shape[2], head_dim), + bat_0213_key.reshape(bsz * heads, bat_0213_key.shape[2], head_dim).transpose(1, 2)) + + layer_scale = 1.0 + if alibi is not None and len(alibi.shape) > 1: + layer_scale = max(1, layer_id).to(float) + + alpha = norm_factor * norm_factor / layer_scale + bmm_output *= alpha + bmm_output_reshape = bmm_output.reshape(bsz, heads, bmm_output.shape[1], bmm_output.shape[2]) + + recompute = is_prompt + if attn_mask is not None and len(attn_mask.shape) > 1 and attn_mask.shape[-1] < bmm_output_reshape.shape[3]: + attn_mask = torch.nn.functional.pad(attn_mask, (0, bmm_output_reshape.shape[3] - attn_mask.shape[-1]), + value=torch.finfo(attn_mask.dtype).min) + softmax_output = SoftmaxOp.softmax_fallback(bmm_output_reshape, attn_mask, alibi, triangular_masking, + recompute, local_attention, window_size, None, layer_scale, 0, 1) + + output = torch.bmm(softmax_output.reshape(bsz * heads, softmax_output.shape[2], softmax_output.shape[3]), + bat_0213_value.reshape(bsz * heads, bat_0213_value.shape[2], head_dim)) + + output = output.reshape(bsz, heads, output.shape[1], head_dim) + output = output.reshape(bsz, heads, output.shape[2] * head_dim) + input_seq_len = query_key_value.shape[1] + t4d_0123_output = self.transform4d_0213(output, input_seq_len) + t4d_0123_output = t4d_0123_output.reshape(bsz, t4d_0123_output.shape[1], heads * head_dim) + + if layer_id == num_layers - 1: + InferenceContext.Instance().advance_tokens() + + return t4d_0123_output, bat_0213_key, bat_0213_value def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, num_kv: int, - norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): + norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor, + is_prompt: bool, token_idx: torch.Tensor, position_ids: torch.Tensor): if alibi is not None: batch_heads = query_key_value.shape[0] * heads @@ -42,6 +139,6 @@ def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: self.config.rotate_every_two, heads, num_kv, norm_factor, self.config.triangular_masking, self.config.local_attention, self.config.window_size, no_masking, layer_id, num_layers, alibi, - self.config.rope_theta) + self.config.rope_theta, is_prompt, token_idx, position_ids) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_add.py b/deepspeed/ops/transformer/inference/op_binding/vector_add.py new file mode 100644 index 0000000000000..015340a1084b5 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/vector_add.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class VectorAddOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(VectorAddOp, self).__init__(config) + try: + self.vector_add_func = self.inference_module._vector_add + except AttributeError: + self.vector_add_func = self.vector_add_fallback + + @classmethod + def vector_add_fallback(cls, a, b, gamma): + """Based on csrc/transformer/inference/csrc/pt_binding.cpp code of _vector_add""" + dtype = a.dtype + return (gamma * a.float() + b.float()).to(dtype) + + def forward(self, a, b, gamma): + return self.vector_add_func(a, b, gamma) diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py index 011be859634d5..cabab8d8c4ab1 100644 --- a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py +++ b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os import torch from ..config import DeepSpeedInferenceConfig from .base import BaseOp @@ -25,7 +24,7 @@ def __init__(self, config: DeepSpeedInferenceConfig): else: self.vector_matmul_func = self.inference_module.vector_matmul_fp16 elif self.config.dtype == torch.int8: - self.vector_matmul_func = self.inference_module.vector_matmul_fp16 + self.vector_matmul_func = self.inference_module.vector_matmul_int8 elif self.config.dtype == torch.bfloat16: self.vector_matmul_func = self.inference_module.vector_matmul_bf16 else: @@ -34,10 +33,7 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.vector_matmul_func = self.vector_matmul_fallback def vector_matmul_fallback(self, input, weight, async_op, q_scale, q_int8, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and not transpose: - return torch.matmul(input, weight) - else: - raise NotImplementedError + return torch.matmul(input, weight.t() if transpose else weight) def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False): q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1) diff --git a/deepspeed/ops/transformer/inference/op_binding/workspace.py b/deepspeed/ops/transformer/inference/op_binding/workspace.py new file mode 100644 index 0000000000000..19de7d9576afe --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/workspace.py @@ -0,0 +1,222 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
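The new `VectorAddOp` above follows the same binding pattern as the other op classes in this PR: bind the compiled `_vector_add` kernel when the inference module provides it, otherwise fall back to plain PyTorch. The sketch below restates what that fallback computes, assuming only that the inputs may be low precision; the function name is illustrative and not part of the DeepSpeed API.

```python
import torch

def vector_add_reference(a: torch.Tensor, b: torch.Tensor, gamma: float) -> torch.Tensor:
    # Same arithmetic as vector_add_fallback above: accumulate in fp32,
    # then cast back so fp16/bf16 inputs keep their original dtype.
    return (gamma * a.float() + b.float()).to(a.dtype)

a = torch.randn(8, dtype=torch.float16)
b = torch.randn(8, dtype=torch.float16)
out = vector_add_reference(a, b, gamma=0.5)
assert out.dtype == torch.float16
```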
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + +minus_inf = -10000.0 +key_idx = 0 +value_idx = 1 + + +class InferenceContext: + + __instance = None + + def __init__(self): + self.kv_cache = None + self.kv_cache_elem_dtype = None + self.num_tokens = 1 + self.kv_cache_num_layers = None + self.kv_cache_size = None + self.max_out_tokens = None + self.rotary = None + self.allocate_called = False + self.static_shapes = True + + @classmethod + def Instance(cls): + if InferenceContext.__instance is None: + InferenceContext.__instance = InferenceContext() + return InferenceContext.__instance + + def gen_workspace(self, num_layers, num_heads, batch_size, prompt_len, hidden_dim, mp_size, external_cache, + elem_dtype, rank, max_out_tokens, min_out_tokens): + self.allocate_called = True + self.kv_cache = None + if not external_cache: + self.kv_cache_num_layers = num_layers + self.max_out_tokens = max_out_tokens + head_size = hidden_dim // num_heads + self.kv_cache_size = torch.Size([batch_size, (num_heads // mp_size), max_out_tokens, head_size]) + self.kv_cache_elem_dtype = elem_dtype + self.num_tokens = 0 + self.static_shapes = True + return True + + def retake_workspace(self): + return True + + def _retake_workspace(self): + assert self.allocate_called, "retake workspace called before allocate workspace" + + import deepspeed.accelerator as accelerator + if self.kv_cache is None: + self.kv_cache = [] + for layer in range(self.kv_cache_num_layers): + self.kv_cache.append((torch.zeros(self.kv_cache_size, + dtype=self.kv_cache_elem_dtype, + device=accelerator.get_accelerator().device_name()), + torch.zeros(self.kv_cache_size, + dtype=self.kv_cache_elem_dtype, + device=accelerator.get_accelerator().device_name()))) + + return True + + def update_cache(self, layer_id, token_idx, is_prompt, bat_0213_key, bat_0213_value): + has_workspace = self._retake_workspace() + assert has_workspace, "Could not allocate workspace" + + # Update current token + if is_prompt: + self.static_shapes = True + if token_idx is None: + self.static_shapes = False + InferenceContext.Instance().reset_tokens(bat_0213_key.shape[2]) + else: + InferenceContext.Instance().reset_tokens(token_idx) + + if token_idx is None: + token_idx = InferenceContext.Instance().current_tokens() + + bsz = bat_0213_key.shape[0] + + # Update cache content + if is_prompt: + cache_max_seq = self.kv_cache_size[2] + cache_max_head_dim = self.kv_cache_size[3] + seq = bat_0213_key.shape[2] + + mask = torch.arange(cache_max_seq, device=bat_0213_key.device) + mask = mask.ge(token_idx) + mask = mask.unsqueeze(-1) + mask = mask.expand([cache_max_seq, cache_max_head_dim]) + + self.kv_cache[layer_id][key_idx][:bsz, :, :seq, :].copy_(bat_0213_key) + self.kv_cache[layer_id][key_idx][:bsz, :].masked_fill_(mask, 0) + self.kv_cache[layer_id][value_idx][:bsz, :, :seq, :].copy_(bat_0213_value) + self.kv_cache[layer_id][value_idx][:bsz, :].masked_fill_(mask, 0) + else: + if self.static_shapes: + assert type(token_idx) == torch.Tensor, "token_idx is expected to be torch.Tensor" + self.kv_cache[layer_id][key_idx][:bsz].index_copy_(2, token_idx - 1, bat_0213_key) + self.kv_cache[layer_id][value_idx][:bsz].index_copy_(2, token_idx - 1, bat_0213_value) + else: + assert type(token_idx) == int, "token_idx is expected to be int" + self.kv_cache[layer_id][key_idx][:bsz, :, token_idx - 1:token_idx, :] = bat_0213_key + self.kv_cache[layer_id][value_idx][:bsz, :, token_idx - 1:token_idx, 
:] = bat_0213_value + + bat_0213_key = self.kv_cache[layer_id][key_idx][:bsz] + bat_0213_value = self.kv_cache[layer_id][value_idx][:bsz] + + if not self.static_shapes: + bat_0213_key = bat_0213_key[:, :, :token_idx, :] + bat_0213_value = bat_0213_value[:, :, :token_idx, :] + + return bat_0213_key, bat_0213_value + + def release_workspace(self): + self.kv_cache = None + self.rotary = None + + def reset_tokens(self, initial_tokens=1): + self.num_tokens = initial_tokens + + def current_tokens(self): + return self.num_tokens + + def advance_tokens(self): + self.num_tokens = self.num_tokens + 1 + + def get_kv_cache(self): + return self.kv_cache + + def get_rotary(self, rotary_dim, rope_theta, device=None): + if self.rotary is None: + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding + + self.rotary = LlamaRotaryEmbedding(rotary_dim, base=rope_theta, device=device) + + return self.rotary + + def get_max_tokens_num(self): + return self.max_out_tokens + + +class WorkspaceOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + self.inference_context = InferenceContext.Instance() + self._is_allocated = False + try: + super(WorkspaceOp, self).__init__(config) + if config.dtype == torch.float32: + self.allocate_workspace_func = self.inference_module.allocate_workspace_fp32 + elif config.dtype == torch.bfloat16: + self.allocate_workspace_func = self.inference_module.allocate_workspace_bf16 + else: + self.allocate_workspace_func = self.inference_module.allocate_workspace_fp16 + self.release_workspace_func = self.inference_module.release_workspace + self.retake_workspace_func = self.inference_module.retake_workspace + self.reset_cache_func = self.inference_module.reset_cache + except (ValueError, AttributeError) as e: + print(f"Using fallback functions in workspace because of {e}") + if config.dtype == torch.float32: + self.allocate_workspace_func = self.allocate_workspace_fp32_fallback + elif config.dtype == torch.bfloat16: + self.allocate_workspace_func = self.allocate_workspace_bf16_fallback + else: + self.allocate_workspace_func = self.allocate_workspace_fp16_fallback + self.release_workspace_func = self.release_workspace_fallback + self.retake_workspace_func = self.retake_workspace_fallback + self.reset_cache_func = self.reset_cache_fallback + + def allocate_workspace(self, *args, **kwargs): + self._is_allocated = True + return self.allocate_workspace_func(*args, **kwargs) + + def release_workspace(self): + self._is_allocated = False + return self.release_workspace_func() + + def reset_cache(self): + return self.reset_cache_func() if self.reset_cache_func else None + + def retake_workspace(self): + return self.retake_workspace_func() if self.retake_workspace_func else None + + def allocate_workspace_fp32_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.float, rank, max_out_tokens, + min_out_tokens) + + def allocate_workspace_bf16_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.bfloat16, rank, max_out_tokens, + min_out_tokens) + + 
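The cache allocated by `gen_workspace` above has layout `[batch, heads // mp_size, max_out_tokens, head_dim]`, and `update_cache` writes prompt tokens as one block while decode steps write a single position at a time. A toy sketch of those two write paths follows; the shapes are illustrative, and the real code additionally zeroes the unused tail of the prompt slot with `masked_fill_`.

```python
import torch

# Toy cache with the same layout as kv_cache_size above.
bsz, heads, max_tokens, head_dim = 1, 2, 8, 4
cache = torch.zeros(bsz, heads, max_tokens, head_dim)

# Prompt phase: copy the whole prompt into the front of the cache.
prompt_len = 3
prompt_k = torch.randn(bsz, heads, prompt_len, head_dim)
cache[:, :, :prompt_len, :].copy_(prompt_k)

# Decode phase with static shapes: write the single new token at position
# token_idx - 1, mirroring the index_copy_ call in update_cache above.
token_idx = torch.tensor([4])          # 1-based "current token" counter
new_k = torch.randn(bsz, heads, 1, head_dim)
cache.index_copy_(2, token_idx - 1, new_k)
```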
def allocate_workspace_fp16_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.half, rank, max_out_tokens, + min_out_tokens) + + def reset_cache_fallback(self): + return self.inference_context.reset_tokens() + + def release_workspace_fallback(self): + return self.inference_context.release_workspace() + + def retake_workspace_fallback(self): + return self.inference_context.retake_workspace() + + def is_allocated(self): + return self._is_allocated diff --git a/deepspeed/ops/transformer/inference/triton/attention.py b/deepspeed/ops/transformer/inference/triton/attention.py index c05370ec74e59..6845d91b06be9 100644 --- a/deepspeed/ops/transformer/inference/triton/attention.py +++ b/deepspeed/ops/transformer/inference/triton/attention.py @@ -125,7 +125,7 @@ def _triton_autotune(min_seqlen, context_4d_matmul(output, qkv, head_size) Fp16Matmul._update_autotune_table() - def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list): qkv_out = qkv_out[0] @@ -143,7 +143,10 @@ def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=TritonSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -161,7 +164,8 @@ def forward( norm_w=None, norm_b=None, alibi=None, - use_triton_attention=True): + use_triton_attention=True, + **kwargs): if not self.config.pre_layer_norm: qkv_out = self.linear_func(input=input, @@ -192,10 +196,16 @@ def forward( triangular=self.triangular_masking) key_layer, value_layer = qkv[:, :, self.hidden_size:2 * self.hidden_size], qkv[:, :, 2 * self.hidden_size:] else: + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) context_layer, key_layer, value_layer = self.ds_compute_attention(qkv_out=qkv_out, input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] diff --git a/deepspeed/ops/transformer/inference/triton/ops.py b/deepspeed/ops/transformer/inference/triton/ops.py index dd87d08d4d2c6..dbed453137806 100644 --- a/deepspeed/ops/transformer/inference/triton/ops.py +++ b/deepspeed/ops/transformer/inference/triton/ops.py @@ -3,12 +3,10 @@ # DeepSpeed Team -import deepspeed -from deepspeed.ops.op_builder import InferenceBuilder import deepspeed.ops.transformer.inference.triton.matmul_ext as matmul_ext +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from deepspeed.ops.transformer.inference.triton.layer_norm import layer_norm, layer_norm_residual - -inference_module = None +from deepspeed.utils.types import ActivationFuncType def vector_matmul_func(input, weight, async_op, q_scale, q_int8, transposed_mode): @@ -76,15 +74,12 @@ def mlp_gemm_func(input, if use_triton_ln: mlp_input = layer_norm_residual(input, input_bias, residual, 
gamma, beta, epsilon) else: - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - mlp_input = inference_module._layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) + mlp_input = LayerNormOp.layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) # activation - if deepspeed.utils.types.ActivationFuncType(mlp_act_func_type) == deepspeed.utils.types.ActivationFuncType.GELU: + if ActivationFuncType(mlp_act_func_type) == ActivationFuncType.GELU: activation = "gelu" - elif deepspeed.utils.types.ActivationFuncType(mlp_act_func_type) == deepspeed.utils.types.ActivationFuncType.ReLU: + elif ActivationFuncType(mlp_act_func_type) == ActivationFuncType.ReLU: activation = "relu" else: activation = "" @@ -121,10 +116,7 @@ def qkv_gemm_func( if use_triton_ln: qkv_input = layer_norm(input, gamma, beta, epsilon) else: - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - qkv_input = inference_module.layer_norm(input, gamma, beta, epsilon) + qkv_input = LayerNormOp()(input, gamma, beta, epsilon) qkv_out = matmul_ext.matmul(qkv_input, weight, bias=(bias if add_bias else None), activation="", use_triton=True) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index f955cf5ebcad6..d65126999199c 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -36,8 +36,10 @@ # MP parameters mpu = None -mp_rank = None -mp_size = None + +#set default values +mp_rank = 0 +mp_size = 1 mp_group = None # Model Parameters @@ -61,8 +63,6 @@ # Default name for the model parallel rng tracker. 
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' -transport_stream = None -cuda_device = None def detach_variable(inputs, device=None): @@ -518,35 +518,10 @@ def save_args_for_backward(*all_args): global mp_rank, mp_size, mp_group global contiguous_data_buffers, contiguous_size_buffers global data_offsets, size_offsets - if mp_rank is None: - if mpu is not None: - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - mp_rank = mpu.get_tensor_model_parallel_rank() - mp_size = mpu.get_tensor_model_parallel_world_size() - mp_group = mpu.get_tensor_model_parallel_group() - else: - mp_rank = mpu.get_model_parallel_rank() - mp_size = mpu.get_model_parallel_world_size() - mp_group = mpu.get_model_parallel_group() - else: - mp_rank = 0 - mp_size = 1 - mp_group = None - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - - if cuda_device is None: - see_memory_usage("First Forward Beginning", force=False) - if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") - logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") - logger.info(f"----Synchronization {SYNCHRONIZE}") - logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + global PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - cuda_device = get_accelerator().current_device_name() - transport_stream = get_accelerator().Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) @@ -631,8 +606,9 @@ def backward(ctx, *grads): raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") - global cuda_device, transport_stream, PARTITION_ACTIVATIONS - + global PARTITION_ACTIVATIONS + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) # Rebuild deepspeed_saved_tensors for t in ctx.deepspeed_saved_tensors: if t is not None and hasattr(t, 'saved_data') and t.saved_data is not None: @@ -764,35 +740,10 @@ def save_args_for_backward(*all_args): global mp_rank, mp_size, mp_group global contiguous_data_buffers, contiguous_size_buffers global data_offsets, size_offsets - if mp_rank is None: - if mpu is not None: - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - mp_rank = mpu.get_tensor_model_parallel_rank() - mp_size = mpu.get_tensor_model_parallel_world_size() - mp_group = mpu.get_tensor_model_parallel_group() - else: - mp_rank = mpu.get_model_parallel_rank() - mp_size = mpu.get_model_parallel_world_size() - mp_group = mpu.get_model_parallel_group() - else: - mp_rank = 0 - mp_size = 1 - mp_group = None - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - - if cuda_device is None: - see_memory_usage("First Forward Beginning", force=False) - if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") - logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") - logger.info(f"----Synchronization {SYNCHRONIZE}") - 
logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + global PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - cuda_device = get_accelerator().current_device_name() - transport_stream = get_accelerator().Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) @@ -899,7 +850,9 @@ def replay_unpack(none_value): raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") - global cuda_device, transport_stream, PARTITION_ACTIVATIONS + global PARTITION_ACTIVATIONS + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) # gather inputs which is partitioned or checkpointed before first forward if PARTITION_ACTIVATIONS: @@ -1152,6 +1105,27 @@ def configure( if CONTIGUOUS_CHECKPOINTING: assert num_layers is not None, "Must specify the number of layers with contiguous memory checkpointing" + global mp_rank, mp_size, mp_group + + if mpu is not None: + if hasattr(mpu, 'get_tensor_model_parallel_rank'): + mp_rank = mpu.get_tensor_model_parallel_rank() + mp_size = mpu.get_tensor_model_parallel_world_size() + mp_group = mpu.get_tensor_model_parallel_group() + else: + mp_rank = mpu.get_model_parallel_rank() + mp_size = mpu.get_model_parallel_world_size() + mp_group = mpu.get_model_parallel_group() + + #print configuration only once + see_memory_usage("After configuration", force=False) + if dist.get_rank() == 0: + logger.info(f"Activation Checkpointing Information") + logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") + logger.info(f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") + logger.info(f"----Synchronization {SYNCHRONIZE}") + logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + def is_configured(): """True if deepspeed activation checkpointing has been configured diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 679230ca7d4c3..55cfa8f59c91a 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -77,7 +77,7 @@ # Steps STEPS_PER_PRINT = "steps_per_print" -STEPS_PER_PRINT_DEFAULT = 10 +STEPS_PER_PRINT_DEFAULT = None ######################################### # Training micro batch size per GPU diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 61e6da2663cf8..05bb23e8ddd94 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -18,13 +18,13 @@ from torch.optim.lr_scheduler import _LRScheduler from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from typing import Callable, Dict, Union, Iterable +from typing import Callable, Dict, Union, Iterable, Container import deepspeed from deepspeed import comm as dist from deepspeed.runtime.utils import see_memory_usage, DummyOptim -from .zero.offload_config import OffloadDeviceEnum +from .zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.runtime.zero.utils import is_zero_supported_optimizer, ZeRORuntimeException @@ -1928,7 +1928,7 @@ def _cast_inputs_half(self, inputs): for k, v in inputs.items(): 
new_inputs[k] = self._cast_inputs_half(v) return new_inputs - elif hasattr(inputs, 'half'): + elif hasattr(inputs, 'half') and inputs.is_floating_point(): return inputs.half() else: return inputs @@ -2145,8 +2145,6 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): else: self.zero_grad() - report_progress = self.global_rank == 0 if self.global_rank else True - # Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function. overflow = False if hasattr(self.optimizer, "overflow"): @@ -2166,8 +2164,10 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): # pipe_engine.train_batch() self.lr_scheduler.step(self.train_batch_size()) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) + if self.steps_per_print() is not None: + report_progress = self.global_rank == 0 if self.global_rank else True + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) self.losses = None self.global_steps += 1 @@ -3675,9 +3675,50 @@ def compile(self, backend=get_accelerator().get_compile_backend(), compile_kwarg if self.is_compiled: return - self.module.compile(backend=backend, **compile_kwargs) + if 'backend' in compile_kwargs: + logger.warning("The `backend` in `compile_kwargs` will be overridden. Use the `backend` argument instead.") + + # create new dict to avoid modifying original dict + self.module.compile(**{**compile_kwargs, 'backend': backend}) self._is_compiled = True @property def is_compiled(self) -> bool: return self._is_compiled + + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False) -> None: + """Offload the engine's states to the specified device. + + Arguments: + include: Optional. The set of states to offload. If not provided, all states are offloaded. + device: Optional. The device to move the ZeRO optimizer buffers to. Currently only `OffloadDeviceEnum.cpu` is supported. + pin_memory: Optional. Whether to pin the memory of the offloaded states. + non_blocking: Optional. Whether to offload the states asynchronously. + """ + assert self.zero_optimization_stage( + ) == ZeroStageEnum.weights, "Moving buffers across devices is supported only for ZeRO stage 3." + + assert not self.zero_offload_param(), "Moving states across devices is not supported for offloaded parameters." + + if device == OffloadDeviceEnum.none: + logger.warning("No device specified for offloading states.") + return + + if device == OffloadDeviceEnum.nvme: + raise ValueError("NVMe offload is not supported for offloading states.") + + self.optimizer.offload_states(include=include, device=device, pin_memory=pin_memory, non_blocking=non_blocking) + + def reload_states(self, non_blocking: bool = False) -> None: + """Reload the engine states to the original device. + + Arguments: + non_blocking: Optional. Whether to offload the states asynchronously. + """ + assert self.zero_optimization_stage( + ) == ZeroStageEnum.weights, "Moving buffers back is supported only for ZeRO stage 3." 
+ self.optimizer.reload_states(non_blocking=non_blocking) diff --git a/deepspeed/runtime/hybrid_engine.py b/deepspeed/runtime/hybrid_engine.py index a991c43045635..8a6311bb6e834 100644 --- a/deepspeed/runtime/hybrid_engine.py +++ b/deepspeed/runtime/hybrid_engine.py @@ -17,16 +17,14 @@ from deepspeed.accelerator import get_accelerator from torch import nn from deepspeed.utils import logger - -from deepspeed.ops.op_builder import InferenceBuilder - from deepspeed.module_inject.layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding +from ..ops.transformer.inference.op_binding.workspace import WorkspaceOp + try: import transformers OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding except: OPTLearnedPositionalEmbedding = None -inference_cuda_module = None class DeepSpeedHybridEngine(DeepSpeedEngine): @@ -61,12 +59,8 @@ def __init__(self, args, model, **kwargs): self._total_batch_size = None self._gather_latency = 0 - global inference_cuda_module - if inference_cuda_module is None: - builder = InferenceBuilder() - inference_cuda_module = builder.load() - self.is_lora_fused = False + self.workspace = WorkspaceOp() def convert_to_linear_transposed(self, model): @@ -160,13 +154,13 @@ def unfuse_lora_weight_non_pinned(self): def retake_inference_cache(self): if self._config.hybrid_engine.release_inference_cache: - retake_success = inference_cuda_module.retake_workspace() + retake_success = self.workspace.retake_workspace() if not retake_success: logger.warning("Unable to acquire workspace on first attempt, emptying cache and retrying.") gc.collect() get_accelerator().empty_cache() - retake_success = inference_cuda_module.retake_workspace() + retake_success = self.workspace.retake_workspace() if not retake_success: raise RuntimeError("Unable to retake inference workspace.") @@ -269,7 +263,7 @@ def generate(self, *inputs, **kwargs): self.is_lora_fused = False if self._config.hybrid_engine.release_inference_cache: - inference_cuda_module.release_workspace() + self.workspace.release_workspace() gc.collect() get_accelerator().empty_cache() diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index d7f7e15a4dbda..f25a19e8e4991 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -247,6 +247,12 @@ def get_lr_from_config(config): return lr_params[WARMUP_MAX_LR], '' +def update_lr(param_groups, lrs): + for param_group, lr in zip(param_groups, lrs): + param_group['lr'] = lr + return [group['lr'] for group in param_groups] + + """ Only optimizers that are subclass of torch.optim.Optimizer are supported. So check the passed optimizer and wrapped optimizer to see if requirement is satisfied. 
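The `update_lr` helper introduced above replaces the per-scheduler copies of the same loop: each scheduler now writes the new learning rates into the optimizer and records what was actually set in a single call. Restated here with a minimal usage check; the SGD optimizer is only a stand-in for illustration.

```python
import torch

def update_lr(param_groups, lrs):
    # Same helper as in the diff above: set each group's lr, then return the
    # values actually stored so callers can keep them as _last_lr.
    for param_group, lr in zip(param_groups, lrs):
        param_group['lr'] = lr
    return [group['lr'] for group in param_groups]

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
last_lr = update_lr(opt.param_groups, [0.01])
assert opt.param_groups[0]['lr'] == 0.01 and last_lr == [0.01]
```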
@@ -328,7 +334,7 @@ def __init__(self, self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continuous_interval if last_batch_iteration == -1: - self._update_optimizer(self.min_lr) + self._last_lr = update_lr(self.optimizer.param_groups, self.min_lr) def _staircase_interval(self): return math.floor(float(self.last_batch_iteration + 1) / self.step_size) @@ -349,16 +355,11 @@ def get_last_lr(self): assert getattr(self, '_last_lr', None) is not None, "need to call step() first" return self._last_lr - def _update_optimizer(self, group_lrs): - for param_group, lr in zip(self.optimizer.param_groups, group_lrs): - param_group['lr'] = lr - def step(self, batch_iteration=None): if batch_iteration is None: batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration - self._update_optimizer(self.get_lr()) - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -615,9 +616,7 @@ def step(self, batch_iteration=None): batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) if self.cycle_momentum: momentums = self.get_mom() @@ -675,11 +674,14 @@ def __init__(self, self.warmup_type = warmup_type self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps) self.last_batch_iteration = last_batch_iteration + # Initialize lr in optimizer + if last_batch_iteration == -1: + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def get_lr(self): if self.last_batch_iteration < 0: logger.warning("Attempting to get learning rate from scheduler before it has started") - return [0.0] + return self.min_lrs gamma = self._get_gamma() return [min_lr + (delta_lr * gamma) for min_lr, delta_lr in zip(self.min_lrs, self.delta_lrs)] @@ -693,9 +695,7 @@ def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -819,6 +819,10 @@ def __init__(self, total_num_steps, warmup_num_steps)) self.org_lrs = [group['lr'] for group in self.optimizer.param_groups] + # Initialize lrs in optimizer groups + if last_batch_iteration == -1: + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) + def get_lr_ratio(self): if self.last_batch_iteration < 0: logger.warning("Attempting to get learning rate from scheduler before it has started") @@ -844,11 +848,7 @@ def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration - - lrs = self.get_lr() - for param_group, lr in zip(self.optimizer.param_groups, lrs): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = 
update_lr(self.optimizer.param_groups, self.get_lr()) def get_lr(self): if self.last_batch_iteration < 0: diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 2b12a9573c4b9..ed6d80b8d4fbf 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -3,7 +3,7 @@ # DeepSpeed Team -import pickle +import msgpack import typing import torch @@ -96,7 +96,7 @@ def wait(): def send_obj(msg: typing.Any, dest: int): """Send an arbitrary python object to ``dest``. - Note: ``msg`` must be pickleable. + Note: ``msg`` must be serializable by msgpack. WARN: This incurs a CPU -> GPU transfer and should be used sparingly for performance reasons. @@ -106,7 +106,7 @@ def send_obj(msg: typing.Any, dest: int): dest (int): Destination rank. """ # serialize the message - msg = pickle.dumps(msg) + msg = msgpack.packb(msg) # construct a tensor to send msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to(get_accelerator().device_name()) @@ -133,7 +133,7 @@ def recv_obj(sender: int) -> typing.Any: msg = torch.empty(length.item(), dtype=torch.uint8).to(get_accelerator().device_name()) dist.recv(msg, src=sender) - msg = pickle.loads(msg.cpu().numpy().tobytes()) + msg = msgpack.unpackb(msg.cpu().numpy().tobytes()) def _to(x): """Recursively move to the current device.""" diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 2c01c3475a70a..b9617d3e632fd 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -9,28 +9,28 @@ """ from collections.abc import Iterable -from deepspeed.moe.utils import is_moe_param import os import psutil import gc from math import sqrt +from numpy import prod + import torch -from deepspeed import comm as dist +from torch.nn import functional as F try: from torch._six import inf except ModuleNotFoundError: from torch import inf +from deepspeed import comm as dist +from deepspeed.moe.utils import is_moe_param from deepspeed.utils import groups, logger from deepspeed.utils.bwc import (bwc_tensor_model_parallel_rank, bwc_pipeline_parallel_world_size, bwc_pipeline_parallel_group) from deepspeed.runtime.constants import PIPE_REPLICATED -from numpy import prod from deepspeed.accelerator import get_accelerator - from deepspeed.module_inject.policy import transpose -from torch.nn import functional as F torch_memory_reserved = get_accelerator().memory_reserved torch_max_memory_reserved = get_accelerator().max_memory_reserved @@ -1065,3 +1065,39 @@ def to_tensor(v): total_norm = -1 return total_norm + + +def _make_offload_state_key(key): + return f"{key}_offload_buffer" + + +def offload_adam_states(optimizer, device, pin_memory: bool = False, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_key(state, key): + offload_buf_key = _make_offload_state_key(key) + if offload_buf_key not in state: + state[offload_buf_key] = torch.empty_like(state[key], device=device) + if pin_memory: + state[offload_buf_key] = get_accelerator().pin_memory(state[offload_buf_key]) + state[offload_buf_key].copy_(state[key], non_blocking=non_blocking) + state[key].data = state[offload_buf_key] + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_key(state, "exp_avg_sq") + + +def reload_adam_states(optimizer, device, non_blocking: bool = False): + """Move optimizer states to device. 
Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_back_key(state, key): + state[key].data = state[_make_offload_state_key(key)].to(device, non_blocking=non_blocking) + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_back_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_back_key(state, "exp_avg_sq") diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index 74a5673bc1bc7..ca35d7a7d1691 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -98,3 +98,12 @@ def set_pipeline(self): pipeline = self.pipeline_read or self.pipeline_write self.__dict__["pipeline"] = pipeline return self + + +class OffloadStateTypeEnum(str, Enum): + """ Enum for internal buffer types """ + optim_states = "optim_states" + hp_params = "hp_params" + lp_params = "lp_params" + lp_grads = "lp_grads" + contiguous_grad_buffer = "contiguous_grad_buffer" diff --git a/deepspeed/runtime/zero/offload_states.py b/deepspeed/runtime/zero/offload_states.py new file mode 100644 index 0000000000000..f521a11a7aa45 --- /dev/null +++ b/deepspeed/runtime/zero/offload_states.py @@ -0,0 +1,74 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Set +import torch + +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.zero.offload_config import OffloadStateTypeEnum + +from deepspeed.utils.tensor_fragment import safe_get_local_fp32_param, safe_get_local_optimizer_state + + +def _make_offload_state_key(key): + return f"{key}_offload_buffer" + + +def offload_adam_states(optimizer, device, pin_memory: bool = False, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_key(state, key): + offload_buf_key = _make_offload_state_key(key) + if offload_buf_key not in state: + state[offload_buf_key] = torch.empty_like(state[key], device=device) + if pin_memory: + state[offload_buf_key] = get_accelerator().pin_memory(state[offload_buf_key]) + state[offload_buf_key].copy_(state[key], non_blocking=non_blocking) + state[key].data = state[offload_buf_key] + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_key(state, "exp_avg_sq") + + +def reload_adam_states(optimizer, device, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_back_key(state, key): + state[key].data = state[_make_offload_state_key(key)].to(device, non_blocking=non_blocking) + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_back_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_back_key(state, "exp_avg_sq") + + +def get_state_devices(model, state: OffloadStateTypeEnum) -> Set[torch.device]: + """Retrieve the devices of the specified state of the model. + + Args: + model (DeepSpeedEngine): The model whose device allocations are to be checked. + state (OffloadStateTypeEnum): The specific state for which the devices should be retrieved. + + Returns: + Set[torch.device]: A set of devices of the specified state. 
+ + """ + if state == OffloadStateTypeEnum.hp_params: + return set(safe_get_local_fp32_param(p).device for p in model.parameters()) + elif state == OffloadStateTypeEnum.lp_params: + return set(p.ds_tensor.device for p in model.parameters()) + elif state == OffloadStateTypeEnum.lp_grads: + return {model.optimizer.grad_partitions_flat_buffer.device} + elif state == OffloadStateTypeEnum.optim_states: + return set(safe_get_local_optimizer_state(p, "exp_avg").device for p in model.parameters()) | \ + set(safe_get_local_optimizer_state(p, "exp_avg_sq").device for p in model.parameters()) + elif state == OffloadStateTypeEnum.contiguous_grad_buffer: + if model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer == None: + return {} + return {model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer.device} diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index 92702edb733b1..1ce2414a1e17c 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -38,7 +38,7 @@ def _apply_forward_and_backward_to_tensors_only(module, forward_function, backwa class ZeROOrderedDict(OrderedDict): - def __init__(self, parent_module, *args, **kwargs): + def __init__(self, parent_module=None, *args, **kwargs): """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. Args: @@ -56,7 +56,7 @@ def __getitem__(self, key): if param is None: return param - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if hasattr(param, "ds_status") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE: if self._parent_module._parameters._in_forward: register_external_parameter(FWD_MODULE_STACK[-1], param) param.all_gather() diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index bdec8a55fcbc5..5780b2afd6def 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -153,11 +153,18 @@ def is_invalid_trace(self) -> bool: def is_record_trace(self) -> bool: return self.__trace_mode == ZeRoTraceMode.RECORD + def _clean_inflight_param_registry(self) -> None: + for param, handle in self.__inflight_param_registry.items(): + handle.wait() + self.__release_param(param) + self.__inflight_param_registry.clear() + def _invalidate_trace(self) -> None: if self.is_invalid_trace(): raise RuntimeError("attempted to invalidate already invalid trace") self.__trace_mode = ZeRoTraceMode.INVALID self._clear_trace_structures() + self._clean_inflight_param_registry() def trace_prologue(self, sub_module: Module) -> None: if self.is_complete_trace(): @@ -204,9 +211,7 @@ def construct_parameter_trace_from_module_trace(self): def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" - if self.__inflight_param_registry: - raise RuntimeError(f"still have inflight params " - f"{[p.ds_summary() for p in self.__inflight_param_registry.keys()]}") + self._clean_inflight_param_registry() if not self.is_complete_trace(): # not self.trace_complete: # Make sure that recorded submodule orders are identical across ranks @@ -409,7 +414,7 @@ def release_and_reset_all(self, module: Module) -> None: """release all module parameters""" for param in iter_params(module, recurse=True): if param in self.__inflight_param_registry: - raise RuntimeError(f"param {param.ds_summary()} still in flight") + self.__inflight_param_registry.pop(param).wait() # 
TODO. make this throw if if there are still active submodules. currently # there's a hook execution issue diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 796957a4c6e5a..e2c273fd913fb 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -6,10 +6,12 @@ import sys import gc import collections -from typing import Deque, Dict, Tuple +import itertools +from typing import Deque, Dict, Set, Tuple, Container from contextlib import contextmanager + from deepspeed import comm as dist -from deepspeed.utils import groups +from deepspeed.utils import groups, z3_leaf_parameter from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.runtime.base_optimizer import ZeROOptimizer @@ -19,9 +21,10 @@ from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum -from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload -from deepspeed.runtime.zero.utils import apply_to_tensors_only +from deepspeed.runtime.zero.utils import apply_to_tensors_only, get_mapping_to_flat_buffer +from deepspeed.runtime.zero.offload_states import offload_adam_states, reload_adam_states from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper @@ -29,7 +32,6 @@ from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE, LOSS_SCALER from deepspeed.accelerator import get_accelerator -from deepspeed.utils import z3_leaf_parameter # Toggle this to true to enable correctness test # with gradient partitioning and without @@ -425,6 +427,8 @@ def __init__( self._link_all_hp_params() + self.offloaded_states: Set(OffloadDeviceEnum) = set() + if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) @@ -563,21 +567,15 @@ def defragment(tensors: List[Tensor]) -> Tensor: cpu_buffer = torch.empty(sum(p.numel() for p in tensors), dtype=get_only_unique_item(t.dtype for t in tensors), device="cpu") - tensor_infos: List[Tuple[Tensor, int, int]] = [] + tensor_infos: List[Tuple[Tensor, int, int]] = get_mapping_to_flat_buffer(tensors) orig_device = get_only_unique_item(t.device for t in tensors) offset = 0 - for tensor in tensors: - tensor_numel = tensor.numel() + for tensor, offset, tensor_numel in tensor_infos: # move the tensor from device memory to host memory cpu_buffer.narrow(0, offset, tensor_numel).copy_(tensor) tensor.data = torch.empty(0, dtype=tensor.dtype, device=tensor.device) - # record some data so we can restore the device tensor later - tensor_infos.append((tensor, offset, tensor_numel)) - - offset += tensor_numel - gc.collect() get_accelerator().empty_cache() @@ -725,15 +723,11 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): for sub_group in self.fp16_groups: for param in sub_group: parameter_partitions.append(param.ds_tensor) - device_buffer = __class__.defragment(parameter_partitions) - # setup flat buffers per subgroup, these are 
each just sections of the - # contiguous flat buffer for all parameters that we created earlier - offset = 0 - for sub_group in self.fp16_groups: - sub_group_numel = sum(param.partition_numel() for param in sub_group) - self.fp16_partitioned_groups_flat.append(device_buffer.narrow(0, offset, sub_group_numel)) - offset += sub_group_numel + # We need to keep the reference to this buffer to make sure you can free it in `offload_states` + self.lp_param_buffer = __class__.defragment(parameter_partitions) + self._set_fp16_partitioned_groups_flat() + else: # partitioned params offloaded to CPU when not in use # create a flat CPU memory allocation for each param group self._create_param_groups_fp16_flat_cpu_memory() @@ -1008,6 +1002,15 @@ def _partitioned_params_swap_out(self, i): swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params(dst_fp16_params=swap_fp16_params, src_fp32_params=swap_fp32_params) + def _set_fp16_partitioned_groups_flat(self): + # setup flat buffers per subgroup, these are each just sections of the + # contiguous flat buffer for all parameters that we created earlier + offset = 0 + for sub_group in self.fp16_groups: + sub_group_numel = sum(param.partition_numel() for param in sub_group) + self.fp16_partitioned_groups_flat.append(self.lp_param_buffer.narrow(0, offset, sub_group_numel)) + offset += sub_group_numel + def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) @@ -2296,6 +2299,24 @@ def get_fp32_grad_for_param(self, param) -> Tensor: return self._fp32_state_allgather(param, fp32_grad) + def set_fp32_grad_for_param(self, value, param): + if not param.requires_grad: + return + + if not get_accelerator().resolves_data_dependency(): + self.reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id] + + my_rank = dist.get_rank(group=self.dp_process_group) + value_partition = value.flatten().narrow(0, fp32_grad.numel() * my_rank, fp32_grad.numel()) + + fp32_grad.data.copy_(value_partition.data) + def _get_fp32_opt_state_partition(self, param, optim_state_key=None): if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() @@ -2344,12 +2365,6 @@ def set_full_hp_param(self, value, param, optim_state_key=None): ### Local API START ### - def get_local_fp32_param(self, param, optim_state_key=None) -> Tensor: - if not param.requires_grad: - return None - fp32_opt_state, group_idx = self._get_fp32_opt_state_partition(param, optim_state_key) - return fp32_opt_state - def get_local_fp32_grad_for_param(self, param) -> Tensor: if not param.requires_grad: return None @@ -2364,6 +2379,30 @@ def get_local_fp32_grad_for_param(self, param) -> Tensor: fp32_grad = self.__param_id_to_grad_partition[param.ds_id].float() return fp32_grad + def set_local_grad_for_param(self, value, param): + if not param.requires_grad: + return + + assert value.numel() == param.ds_tensor.numel( + ), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}" + + if not get_accelerator().resolves_data_dependency(): + self.reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, 
dest_offset, num_elements) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id] + + fp32_grad.data.copy_(value.flatten().data) + + def get_local_fp32_param(self, param, optim_state_key=None) -> Tensor: + if not param.requires_grad: + return None + fp32_opt_state, group_idx = self._get_fp32_opt_state_partition(param, optim_state_key) + return fp32_opt_state + def set_local_hp_param(self, value, param, optim_state_key=None): if not param.requires_grad: return @@ -2378,7 +2417,7 @@ def set_local_hp_param(self, value, param, optim_state_key=None): if self._swappable_optimizer_subgroup(group_idx): self._optimizer_states_and_gradient_swap_out(group_idx) - logger.info(f"[set_local_hp_param][update the params' value successfully]") + # logger.info(f"[set_local_hp_param][update the params' value successfully]") ### Local API END ### @@ -2782,6 +2821,149 @@ def checkpoint_event_epilogue(self): def empty_partition_cache(self): self.parameter_offload.empty_partition_cache() + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False): + device = device.value + + self.empty_partition_cache() + + assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, f"Offloading is supported only for DeepSpeed FusedAdam." + + def needs_offload(target): + # return True + return target not in self.offloaded_states and (include == None or target in include) + + # HP param + if needs_offload(OffloadStateTypeEnum.hp_params): + if pin_memory: + if not hasattr(self, "hp_params_pin_buffers"): + self.hp_params_pin_buffers = [ + get_accelerator().pin_memory(torch.empty_like(t, device=device)) + for t in self.fp32_partitioned_groups_flat + ] + + for src_tensor, dest_buf in zip(self.fp32_partitioned_groups_flat, self.hp_params_pin_buffers): + dest_buf.copy_(src_tensor, non_blocking=non_blocking) + src_tensor.data = dest_buf + else: + for buf in self.fp32_partitioned_groups_flat: + buf.data = buf.data.to(device, non_blocking=non_blocking) + self.offloaded_states.add(OffloadStateTypeEnum.hp_params) + + # LP param + if needs_offload(OffloadStateTypeEnum.lp_params): + if pin_memory: + if not hasattr(self, "lp_param_contiguous_pin_buffer"): + self.lp_param_contiguous_pin_buffer = get_accelerator().pin_memory( + torch.empty_like(self.lp_param_buffer, device=device)) + self.lp_param_contiguous_pin_buffer.copy_(self.lp_param_buffer, non_blocking=non_blocking) + cpu_buffer = self.lp_param_contiguous_pin_buffer + else: + cpu_buffer = self.lp_param_buffer.to(device, non_blocking=non_blocking) + + self.lp_param_buffer.data = cpu_buffer + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer( + [p.ds_tensor for p in self.module.parameters()]): + tensor.data = cpu_buffer.narrow(0, offset, tensor_numel) + + self.fp16_partitioned_groups_flat.clear() + self.offloaded_states.add(OffloadStateTypeEnum.lp_params) + + # LP grad + if needs_offload(OffloadStateTypeEnum.lp_grads): + if pin_memory: + if not hasattr(self, "lp_grad_partitions_flat_pin_buffers"): + self.lp_grad_partitions_flat_pin_buffers = get_accelerator().pin_memory( + torch.empty_like(self.grad_partitions_flat_buffer, device=device)) + self.lp_grad_partitions_flat_pin_buffers.copy_(self.grad_partitions_flat_buffer, + non_blocking=non_blocking) + self.grad_partitions_flat_buffer.data = self.lp_grad_partitions_flat_pin_buffers + else: + self.grad_partitions_flat_buffer.data = 
self.grad_partitions_flat_buffer.data.to(device) + self.averaged_gradients = {} + + self.__param_id_to_grad_partition = {} + + self.offloaded_states.add(OffloadStateTypeEnum.lp_grads) + + # contiguous bucket + if needs_offload(OffloadStateTypeEnum.contiguous_grad_buffer): + if hasattr(self, "_DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer"): + # Record properties like shape, strides, etc. as a meta tensor + self.grad_buffer_meta = self.__ipg_bucket_flat_buffer.to("meta") + self.__ipg_bucket_flat_buffer = None + self.offloaded_states.add(OffloadStateTypeEnum.contiguous_grad_buffer) + + # Adam + if needs_offload(OffloadStateTypeEnum.optim_states): + offload_adam_states(self.optimizer, device, pin_memory=pin_memory, non_blocking=non_blocking) + self.offloaded_states.add(OffloadStateTypeEnum.optim_states) + + gc.collect() + get_accelerator().empty_cache() + + def reload_states(self, non_blocking: bool = False): + + device = get_accelerator().current_device_name() + + # HP param + if OffloadStateTypeEnum.hp_params in self.offloaded_states: + if hasattr(self, "hp_params_pin_buffers"): + for src, dest in zip(self.hp_params_pin_buffers, self.fp32_partitioned_groups_flat): + dest.data = src.to(device, non_blocking=non_blocking) + else: + for buf in self.fp32_partitioned_groups_flat: + buf.data = buf.data.to(device, non_blocking=non_blocking) + self.offloaded_states.remove(OffloadStateTypeEnum.hp_params) + + # LP Param + if OffloadStateTypeEnum.lp_params in self.offloaded_states: + cpu_buffer = self.lp_param_contiguous_pin_buffer if hasattr( + self, "lp_param_contiguous_pin_buffer") else self.lp_param_buffer + self.lp_param_buffer.data = cpu_buffer.data.to(device, non_blocking=non_blocking) + self._set_fp16_partitioned_groups_flat() + + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer( + [p.ds_tensor for p in self.module.parameters()]): + tensor.data = self.lp_param_buffer.narrow(0, offset, tensor_numel) + self.offloaded_states.remove(OffloadStateTypeEnum.lp_params) + + # LP grad + if OffloadStateTypeEnum.lp_grads in self.offloaded_states: + if hasattr(self, "lp_grad_partitions_flat_pin_buffers"): + self.grad_partitions_flat_buffer.data = self.lp_grad_partitions_flat_pin_buffers.to( + device, non_blocking=non_blocking) + else: + self.grad_partitions_flat_buffer.data = self.grad_partitions_flat_buffer.data.to( + device, non_blocking=non_blocking) + self.averaged_gradients = {} + + offset = 0 + all_params = list(itertools.chain.from_iterable(self.fp16_groups)) + for param in all_params: + self.__param_id_to_grad_partition[param.ds_id] = self.grad_partitions_flat_buffer.narrow( + 0, offset, param.partition_numel()) + offset += param.partition_numel() + + self.offloaded_states.remove(OffloadStateTypeEnum.lp_grads) + + # contiguous bucket + if OffloadStateTypeEnum.contiguous_grad_buffer in self.offloaded_states: + self.__ipg_bucket_flat_buffer = torch.empty_like(self.grad_buffer_meta, device=device) + # self.__ipg_bucket_flat_buffer.data = self.__ipg_bucket_flat_buffer.data.to(device) + self.offloaded_states.remove(OffloadStateTypeEnum.contiguous_grad_buffer) + + # Adam + if OffloadStateTypeEnum.optim_states in self.offloaded_states: + reload_adam_states(self.optimizer, device, non_blocking=non_blocking) + self.offloaded_states.remove(OffloadStateTypeEnum.optim_states) + + if non_blocking: + get_accelerator().synchronize() + def _handle_overflow(cpu_sum, x, i): import math diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 
83cf996ca0192..df7a2f83e3bcc 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -39,6 +39,7 @@ OPTIMIZER_GRADIENTS_TIMER = 'optimizer_gradients' OPTIMIZER_STEP_TIMER = 'optimizer_step' OPTIMIZER_TIMERS = [OPTIMIZER_ALLGATHER_TIMER, OPTIMIZER_GRADIENTS_TIMER, OPTIMIZER_STEP_TIMER] +INITIAL_MICRO_STEP_ID = -1 def input(msg): @@ -224,7 +225,7 @@ def __init__(self, self.gradient_predivide_factor = gradient_predivide_factor self.postscale_gradients = postscale_gradients self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = 0 + self.micro_step_id = INITIAL_MICRO_STEP_ID self.ignore_unused_parameters = ignore_unused_parameters self.round_robin_gradients = round_robin_gradients @@ -1231,9 +1232,7 @@ def copy_gradients_to_cpu(): if self.micro_step_id > 0: accumulate_gradients() - - # at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: + else: copy_gradients_to_cpu() def set_norm_for_param_grad(self, param): @@ -1824,7 +1823,7 @@ def step(self, closure=None): """ Not supporting closure. """ - self.micro_step_id = -1 + self.micro_step_id = INITIAL_MICRO_STEP_ID see_memory_usage(f"In step before checking overflow") diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index 8f913d065934a..2d1cf17962d8a 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -4,7 +4,7 @@ # DeepSpeed Team import os -from typing import List +from typing import List, Tuple import torch from deepspeed import comm as dist @@ -160,3 +160,16 @@ def apply_to_tensors_only(function, value, warning_msg_fn=None): logger.warning(warning_msg_fn(value)) warned = True return value + + +def get_mapping_to_flat_buffer(tensors: List[torch.Tensor]) -> List[Tuple[torch.Tensor, int, int]]: + tensor_infos: List[Tuple[torch.Tensor, int, int]] = [] + + offset = 0 + for tensor in tensors: + tensor_numel = tensor.numel() + # record some data so we can restore the device tensor later + tensor_infos.append((tensor, offset, tensor_numel)) + offset += tensor_numel + + return tensor_infos diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index 75fb6aa9d30ad..c6a202d485eba 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -12,10 +12,10 @@ # TODO: Move tensor fragment and mixed precision to zero utils from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad, map_to_flat_opt_states from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state -from .tensor_fragment import set_full_hp_param -from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state +from .tensor_fragment import set_full_hp_param, set_full_hp_grad +from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state, safe_set_full_grad from .tensor_fragment import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state -from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_optimizer_state +from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter from .mixed_precision_linkage import link_hp_params, lazy_init_hp_params_optimizer_state from deepspeed.runtime.dataloader import RepeatingLoader diff --git 
a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 1e62d96e10325..55fb72f6c9fba 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -7,6 +7,8 @@ import logging import sys import os +import torch +from deepspeed.runtime.compiler import is_compile_supported log_levels = { "debug": logging.DEBUG, @@ -19,6 +21,31 @@ class LoggerFactory: + def create_warning_filter(logger): + warn = False + + def warn_once(record): + nonlocal warn + if is_compile_supported() and torch.compiler.is_compiling() and not warn: + warn = True + logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to" + " disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1") + return True + + return warn_once + + @staticmethod + def logging_decorator(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if torch.compiler.is_compiling(): + return + else: + return func(*args, **kwargs) + + return wrapper + @staticmethod def create_logger(name=None, level=logging.INFO): """create a logger @@ -44,6 +71,12 @@ def create_logger(name=None, level=logging.INFO): ch.setLevel(level) ch.setFormatter(formatter) logger_.addHandler(ch) + if os.getenv("DISABLE_LOGS_WHILE_COMPILING", "0") == "1": + for method in ['info', 'debug', 'error', 'warning', 'critical', 'exception']: + original_logger = getattr(logger_, method) + setattr(logger_, method, LoggerFactory.logging_decorator(original_logger)) + else: + logger_.addFilter(LoggerFactory.create_warning_filter(logger_)) return logger_ diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py index 7dea6ba322dbe..c97515ca8fef7 100644 --- a/deepspeed/utils/mixed_precision_linkage.py +++ b/deepspeed/utils/mixed_precision_linkage.py @@ -5,7 +5,7 @@ import types from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping -from deepspeed.utils import set_full_hp_param +from deepspeed.utils import set_full_hp_param, set_full_hp_grad def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, @@ -35,6 +35,7 @@ def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_gr lp_param.get_full_hp_param = types.MethodType(get_full_hp_param, lp_param) lp_param.get_full_hp_grad = types.MethodType(get_full_hp_grad, lp_param) lp_param.set_full_hp_param = types.MethodType(set_full_hp_param, lp_param) + lp_param.set_full_hp_grad = types.MethodType(set_full_hp_grad, lp_param) # lp_param overlaps with partition if both are true # 1) current_offset < partition_end, diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index b34722580ddd1..053c8b5adad06 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -57,6 +57,17 @@ def get_hp_fragment(self, optim_state_key=None): return self.hp_fragment return self.get_optim_state_fragment(optim_state_key) + def get_lp_grad_fragment(self, index_in_param_group): + if self.use_offload: + gradient_dict = self.offload_gradient_dict + else: + gradient_dict = self.gradient_dict + + if self.param_group_index not in gradient_dict or gradient_dict[self.param_group_index] is None: + raise ValueError("Gradients are only available immediately after backward and before engine step") + + return gradient_dict[self.param_group_index][index_in_param_group] + def map_to_flat_opt_states(flat_hp_tensor, lp_tensors, optim_state, opt_keys): for key in opt_keys: @@ -95,17 +106,7 @@ def set_full_hp_param(self, 
value, optim_state_key=None): def get_full_hp_grad(self): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() if self._hp_mapping is not None: - hp_mapping = self._hp_mapping - - if hp_mapping.use_offload: - gradient_dict = hp_mapping.offload_gradient_dict - else: - gradient_dict = hp_mapping.gradient_dict - - if hp_mapping.param_group_index not in gradient_dict or gradient_dict[hp_mapping.param_group_index] is None: - raise ValueError("Gradients are only available immediately after backward and before engine step") - - lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][self._index_in_param_group] + lp_grad_fragment = self._hp_mapping.get_lp_grad_fragment(self._index_in_param_group) hp_grad_fragment = lp_grad_fragment.to(torch.float32).flatten() lp_frag_address = self._hp_mapping.lp_fragment_address @@ -120,6 +121,14 @@ def get_full_hp_grad(self): return reduce_buffer.reshape_as(self) +def set_full_hp_grad(self, value): + if self._hp_mapping is not None: + lp_grad_fragment = self._hp_mapping.get_lp_grad_fragment(self._index_in_param_group) + lp_frag_address = self._hp_mapping.lp_fragment_address + value_fragment = torch.narrow(value.flatten(), 0, lp_frag_address.start, lp_frag_address.numel) + lp_grad_fragment.data.copy_(value_fragment.data.reshape_as(lp_grad_fragment.data)) + + def safe_get_full_fp32_param(param): """Assemble and return the fp32 parameter of a low-precision (e.g., fp16) parameter. @@ -188,7 +197,10 @@ def safe_set_full_optimizer_state(param, value, optim_state_key): # TODO: Figure out the correct return dtype def safe_get_full_grad(param): - """Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. + """ + Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. + The return data type is that used for gradient accumulation. This is usually the param data type, + but could also be different (e.g., bf16 param training with fp32 gradient accumulation). Args: param (``torch.nn.Parameter``): A model parameter @@ -207,74 +219,95 @@ def safe_get_full_grad(param): return None +def safe_set_full_grad(param, value): + """ + Update the partitioned gradient of a low-precision (e.g., fp16) parameter. + To avoid precision issues, the update value should have the data type of + gradient accumulation. + + Args: + param (``torch.nn.Parameter``): A model parameter + value (``torch.Tensor``): The un-partitioned new gradient value. + """ + if param.grad is not None: + param.grad.copy_(value) + elif hasattr(param, 'ds_id'): + # ZeRO stage 3 param + param._z3_optimizer.set_fp32_grad_for_param(value, param) + elif hasattr(param, '_hp_mapping'): + # ZeRO stage 1, 2, and bf16_optimizer params + param.set_full_hp_grad(value) + + ### Local API START ### def safe_get_local_grad(param): - """Get the fp32 gradient of a partitioned parameter. + """ + Get the local gradient partition of a ZeRO-3 partitioned parameter. + The return data type is that used for gradient accumulation. This is usually the param data type, + but could also be different (e.g., bf16 param training with fp32 gradient accumulation). 
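A hedged usage sketch of the full-gradient accessors added here (`safe_get_full_grad` / `safe_set_full_grad`): `engine` is assumed to be an initialized DeepSpeed engine, and gradient clipping is purely an illustrative operation, not something prescribed by this patch.

```python
from deepspeed.utils import safe_get_full_grad, safe_set_full_grad


def clip_full_grads(engine, max_abs=1.0):
    """Clip each parameter's assembled gradient and write it back to the partitions.

    Call after ``engine.backward(loss)`` and before ``engine.step()``.
    """
    for _, param in engine.module.named_parameters():
        full_grad = safe_get_full_grad(param)  # dtype used for gradient accumulation
        if full_grad is not None:
            safe_set_full_grad(param, full_grad.clamp(-max_abs, max_abs))
```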
Args: param (``torch.nn.Parameter``): A model parameter """ - if param.grad is not None: - return param.grad + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_grad_for_param(param) - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_grad_for_param(param) - return None +def safe_set_local_grad(param, value): + """ + Update the local gradient partition of a ZeRO-3 partitioned parameter. + To avoid precision issues, the update value should have the data type of + gradient accumulation. + + Args: + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local gradient partition. + """ + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_grad_for_param(value, param) def safe_get_local_fp32_param(param): - """Get the fp32 partitioned parameter. + """Get the local partition of a ZeRO-3 partitioned parameter in fp32 precision. Args: - param (``torch.nn.Parameter``): A model parameter + param (``torch.nn.Parameter``): A model parameter. """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_param(param) - - return None + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_param(param) def safe_get_local_optimizer_state(param, optim_state_key): - """Get the fp32 optimizer state of a partitioned parameter. + """Get the local optimizer state partition of ZeRO-3 partitioned parameter in fp32 precision. Args: param (``torch.nn.Parameter``): A model parameter optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer) """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) - - return None + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) def safe_set_local_optimizer_state(param, value, optim_state_key): - """Update the fp32 optimizer state of a partitioned parameter. + """Update the local optimizer state partition of a ZeRO-3 partitioned parameter. Args: - param (``torch.nn.Parameter``): A model parameter - value (``torch.Tensor``): New value - optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer) + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local optimizer state partition. + optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer). """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) def safe_set_local_fp32_param(param, value): - """Update the partitioned fp32 parameter. + """Update the local partition of ZeRO-3 partitioned parameter. Args: - param (``torch.nn.Parameter``): A model parameter - value (``torch.Tensor``): New value + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local parameter partition. 
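A similar sketch for the ZeRO-3 local (per-rank) gradient APIs: `engine` is again an assumed DeepSpeed engine running ZeRO stage 3, and zeroing the partition is only an example of an update whose value matches the local partition's shape and dtype.

```python
import torch
from deepspeed.utils import safe_get_local_grad, safe_set_local_grad


def zero_local_grads(engine):
    """Zero this rank's ZeRO-3 gradient partitions (after backward, before step)."""
    for _, param in engine.module.named_parameters():
        local_grad = safe_get_local_grad(param)  # this rank's partition only
        # the replacement must match the local partition's shape and dtype
        safe_set_local_grad(param, torch.zeros_like(local_grad))
```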
""" - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - param._z3_optimizer.set_local_hp_param(value, param) + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_hp_param(value, param) ### Local API END ### -# TODO: Implement API for setting ZeRO partitioned gradients - def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, param_group_index, partition_start, partition_size): diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 00f17dea709c9..64ae8ac0e5b4f 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -198,7 +198,7 @@ def get_mean(self, names, normalizer=1.0, reset=True): class ThroughputTimer: - def __init__(self, config, batch_size, start_step=2, steps_per_output=50, monitor_memory=False, logging_fn=None): + def __init__(self, config, batch_size, start_step=2, steps_per_output=None, monitor_memory=False, logging_fn=None): from deepspeed.utils import logger self.config = config self.start_time = 0 @@ -238,6 +238,11 @@ def start(self): get_accelerator().synchronize() self.start_time = time.time() + def _is_report_boundary(self): + if self.steps_per_output is None: + return False + return self.global_step_count % self.steps_per_output == 0 + def stop(self, global_step=False, report_speed=True): if not self.config.enabled or not self.started: return @@ -255,7 +260,7 @@ def stop(self, global_step=False, report_speed=True): self.step_elapsed_time += duration if global_step: - if report_speed and self.global_step_count % self.steps_per_output == 0: + if report_speed and self._is_report_boundary(): self.logging( "epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, " "MemAllocated={}GB, MaxMemAllocated={}GB".format( diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index 24cc342e78d1a..e69ecd9acb5a2 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -10,7 +10,10 @@ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any # application. # -# example: python zero_to_fp32.py . pytorch_model.bin +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization import argparse import torch @@ -18,6 +21,8 @@ import math import os import re +import json +from tqdm import tqdm from collections import OrderedDict from dataclasses import dataclass @@ -139,7 +144,6 @@ def parse_model_states(files): def parse_optim_states(files, ds_checkpoint_dir): - total_files = len(files) state_dicts = [] for f in files: @@ -420,12 +424,10 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero offset = 0 total_numel = 0 total_params = 0 - for name, shape in param_shapes.items(): - + for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'): unpartitioned_numel = shape.numel() total_numel += unpartitioned_numel total_params += 1 - partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) if debug: @@ -521,21 +523,75 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) -def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): """ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. Args: - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) - - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` - ``exclude_frozen_parameters``: exclude frozen parameters """ - + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) - print(f"Saving fp32 state dict to {output_file}") - torch.save(state_dict, output_file) + + # Shard the model if it is too big. 
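A hedged sketch of calling the updated conversion entry point with the new sharding and safetensors options; the paths below are illustrative. The equivalent command line would be `python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12-output/ --max_shard_size 5GB --safe_serialization`.

```python
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    "path/checkpoint-12",           # folder containing the tag folder, e.g. global_step14
    "path/checkpoint-12-output/",   # output directory (no longer a single .bin file)
    max_shard_size="5GB",           # sharding requires `pip install huggingface_hub`
    safe_serialization=True,        # requires `pip install safetensors`
)
```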
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + state_dict_split = split_torch_state_dict_into_shards(state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard, output_path, metadata={"format": "pt"}) + else: + torch.save(shard, output_path) + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): @@ -578,15 +634,27 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument("checkpoint_dir", type=str, help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") parser.add_argument( - "output_file", + "--max_shard_size", type=str, - help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. 
If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") parser.add_argument("-t", "--tag", type=str, @@ -599,6 +667,8 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): debug = args.debug convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, - args.output_file, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, tag=args.tag, exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md old mode 100644 new mode 100755 index e1903ed058920..d5a08b27bf4db --- a/docs/_tutorials/automatic-tensor-parallelism.md +++ b/docs/_tutorials/automatic-tensor-parallelism.md @@ -158,6 +158,7 @@ The following model families have been successfully tested with automatic tensor - plbart - qwen - qwen2 +- qwen2-moe - reformer - roberta - roformer diff --git a/docs/_tutorials/deepnvme.md b/docs/_tutorials/deepnvme.md index 480bcf2d95dfe..4ed528412eae9 100644 --- a/docs/_tutorials/deepnvme.md +++ b/docs/_tutorials/deepnvme.md @@ -50,7 +50,7 @@ Type "help", "copyright", "credits" or "license" for more information. >>> h = AsyncIOBuilder().load().aio_handle() >>> h. h.async_pread( h.free_cpu_locked_tensor( h.get_overlap_events( h.get_single_submit( h.new_cpu_locked_tensor( h.pwrite( h.sync_pread( h.wait( -h.async_pwrite( h.get_block_size( h.get_queue_depth( h.get_thread_count( h.pread( h.read( h.sync_pwrite( h.write( +h.async_pwrite( h.get_block_size( h.get_queue_depth( h.get_intra_op_parallelism( h.pread( h.read( h.sync_pwrite( h.write( ``` The APIs of interest for performing I/O operations are those named with `pread` and `pwrite` substrings. For brevity, we will focus on the file write APIs, namely `sync_pwrite`, `async_pwrite`, and `pwrite`. We will discuss only `sync_pwrite` and `async_pwrite` below because they are specializations of `pwrite`. @@ -107,7 +107,7 @@ Similar safety problems apply to reading the destination tensor of a non-blockin ### Parallel File Write -An important DeepNVMe optimization is the ability to parallelize individual I/O operations. This optimization is enabled by specifying the desired parallelism degree when constructing a DeepNVMe handle. Subsequent I/O operations with that handle are automatically parallelized over the requested number of host or device threads, as appropriate. I/O parallelism is composable with either the blocking or non-blocking I/O APIs. The example below illustrates 4-way parallelism of a file write using `async_pwrite`. Note the use of `num_threads` argument to specify the desired parallelism degree in handle creation. +An important DeepNVMe optimization is the ability to parallelize individual I/O operations. This optimization is enabled by specifying the desired parallelism degree when constructing a DeepNVMe handle. Subsequent I/O operations with that handle are automatically parallelized over the requested number of host or device threads, as appropriate. I/O parallelism is composable with either the blocking or non-blocking I/O APIs. 
The example below illustrates 4-way parallelism of a file write using `async_pwrite`. Note the use of `intra_op_parallelism` argument to specify the desired parallelism degree in handle creation. ```bash >>> import os @@ -116,7 +116,7 @@ False >>> import torch >>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() >>> from deepspeed.ops.op_builder import AsyncIOBuilder ->>> h = AsyncIOBuilder().load().aio_handle(num_threads=4) +>>> h = AsyncIOBuilder().load().aio_handle(intra_op_parallelism=4) >>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') >>> h.wait() 1 @@ -188,7 +188,7 @@ This tutorial has been significantly improved by feedback from [Guanhua Wang](ht ## Appendix ### Advanced Handle Creation -Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `num_threads`. The `aio_handle` constructor parameters and default values are illustrated below: +Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, PCIe, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `intra_op_parallelism`. The `aio_handle` constructor parameters and default values are illustrated below: ```bash >>> from deepspeed.ops.op_builder import AsyncIOBuilder >>> help(AsyncIOBuilder().load().aio_handle()) @@ -203,11 +203,61 @@ class aio_handle(pybind11_builtins.pybind11_object) | Methods defined here: | | __init__(...) - | __init__(self: async_io.aio_handle, block_size: int = 1048576, queue_depth: int = 128, single_submit: bool = False, overlap_events: bool = False, num_threads: int = 1) -> None + | __init__(self: async_io.aio_handle, block_size: int = 1048576, queue_depth: int = 128, single_submit: bool = False, overlap_events: bool = False, intra_op_parallelism: int = 1) -> None | | AIO handle constructor ``` +### Performance Tuning +As discussed [earlier](#advanced-handle-creation), achieving peak DeepNVMe performance for a target workload or environment requires using optimally configured `aio_handle` or `gds_handle` handles. For configuration convenience, we provide a utility called `ds_nvme_tune` to automate the discovery of optimal DeepNVMe configurations. `ds_nvme_tune` automatically explores a user-specified or default configuration space and recommends the option that provides the best read and write performance. Below is an example usage of `ds_nvme_tune` to tune `aio_handle` data transfers between GPU memory and a local NVVMe SSD mounted on `/local_nvme`. 
This example used the default configuration space of `ds_nvme_tune` for tuning. + +```bash +$ ds_nvme_tune --nvme_dir /local_nvme --gpu +Running DeepNVMe performance tuning on ['/local_nvme/'] +Best performance (GB/sec): read = 3.69, write = 3.18 +{ + "aio": { + "single_submit": "false", + "overlap_events": "true", + "intra_op_parallelism": 8, + "queue_depth": 32, + "block_size": 1048576 + } +} +``` +The above tuning was executed on a Lambda workstation equipped with two NVIDIA A6000-48GB GPUs, 252GB of DRAM, and a [CS3040 NVMe 2TB SDD](https://www.pny.com/CS3040-M2-NVMe-SSD?sku=M280CS3040-2TB-RB) with peak read and write speeds of 5.6 GB/s and 4.3 GB/s respectively. The tuning required about four and half minutes. Based on the results, one can expect to achieve read and write transfer speeds of 3.69 GB/sec and 3.18 GB/sec respectively by using an `aio_handle` configured as below. + +```python +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle(block_size=1048576, + queue_depth=32, + single_submit=False, + overlap_events=True, + intra_op_parallelism=8) +``` + + +The full command line options of `ds_nvme_tune` can be obtained via the normal `-h` or `--help`. +```bash +usage: ds_nvme_tune [-h] --nvme_dir NVME_DIR [NVME_DIR ...] [--sweep_config SWEEP_CONFIG] [--no_read] [--no_write] [--io_size IO_SIZE] [--gpu] [--gds] [--flush_page_cache] [--log_dir LOG_DIR] [--loops LOOPS] [--verbose] + +options: + -h, --help show this help message and exit + --nvme_dir NVME_DIR [NVME_DIR ...] + Directory in which to perform I/O tests. A writeable directory on a NVMe device. + --sweep_config SWEEP_CONFIG + Performance sweep configuration json file. + --no_read Disable read performance measurements. + --no_write Disable write performance measurements. + --io_size IO_SIZE Number of I/O bytes to read/write for performance measurements. + --gpu Test tensor transfers between GPU device and NVME device. + --gds Run the sweep over NVIDIA GPUDirectStorage operator + --flush_page_cache Page cache will not be flushed and reported read speeds may be higher than actual ***Requires sudo access***. + --log_dir LOG_DIR Output directory for performance log files. Default is ./_aio_bench_logs + --loops LOOPS Count of operation repetitions + --verbose Print debugging information. +``` + ### DeepNVMe APIs For convenience, we provide listing and brief descriptions of the DeepNVMe APIs. @@ -242,6 +292,6 @@ Function | Description |---|---| get_queue_depth | Return queue depth setting | get_single_submit | Return whether single_submit is enabled | -get_thread_count | Return I/O parallelism degree | +get_intra_op_parallelism | Return I/O parallelism degree | get_block_size | Return I/O block size setting | get_overlap_events | Return whether overlap_event is enabled | diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index 2a6a48ca91db8..aa8139a654a10 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -369,13 +369,13 @@ These routines can be used in a training loop as shown in the following snippet. from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state for n, lp in model.named_parameters(): # 1. 
Access the full states - # 1) gradient lookup + # 1.1) gradient lookup # For zero1 and zero2, gradient lookup must be called after `backward` and before `step` # For zero3, gradient lookup must be called after `backward` hp_grad = safe_get_full_grad(lp) - # 2) fp32 and optim states can probably be called anywhere in the training loop, but will be updated after `step` + # 1.2) fp32 and optim states can probably be called anywhere in the training loop, but will be updated after `step` hp = safe_get_full_fp32_param(lp) exp_avg = safe_get_full_optimizer_state(lp, "exp_avg") exp_avg_sq = safe_get_full_optimizer_state(lp, "exp_avg_sq") @@ -396,34 +396,39 @@ These routines can be used in a training loop as shown in the following snippet. Modifying Partitioned States ---------------------------- -Sometimes, a user may want to modify parameters or optimizer states outside of the regular training loop. This is currently difficult in ZeRO training because of partitioning. To overcome that, DeepSpeed provides the following routines for modifying the fp32 master parameters and the fp32 optimizer states. +Sometimes, a user may want to modify parameters, gradients, or optimizer states outside of the regular training loop. This is currently difficult in ZeRO training because of partitioning. To overcome that, DeepSpeed provides the following routines for modifying the fp32 master parameters and the fp32 optimizer states. .. autofunction:: deepspeed.utils.safe_set_full_fp32_param .. autofunction:: deepspeed.utils.safe_set_full_optimizer_state +.. autofunction:: deepspeed.utils.safe_set_full_grad + .. autofunction:: deepspeed.utils.safe_set_local_fp32_param +.. autofunction:: deepspeed.utils.safe_set_local_grad + .. autofunction:: deepspeed.utils.safe_set_local_optimizer_state -These routines can be used at any point after initialization of the DeepSpeed engine (i.e., ``deepspeed.initialize()``) as shown in the following snippet. +The routines for modifying parameters and optimizer states can be used at any point after initialization of the DeepSpeed engine (i.e., ``deepspeed.initialize()``) as shown in the following snippet. .. code-block:: python [...] + from deepspeed.runtime.zero.utils import is_zero_param from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_optimizer_state from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state # Here is an example to zero all the fp32 parameters and optimizer states. for n, lp in model.named_parameters(): - # 1. For zero stage 1 or 2, set the full fp32 and their full optim states - zero_tensor = torch.zeros_like(lp) + # 1. For zero stage 1, 2, or 3 set the full fp32 and their full optim states + zero_tensor = torch.zeros(lp.ds_shape) if is_zero_param(lp) else torch.zeros(lp.shape) safe_set_full_fp32_param(lp, zero_tensor) safe_get_full_optimizer_state(lp, zero_tensor, "exp_avg") safe_get_full_optimizer_state(lp, zero_tensor, "exp_avg_sq") # 2. For zero stage 3, each process sets its local fp32 parameters and their local optimizer states individually - zero_tensor_local = torch.zeros_like(lp.ds_tensor.shape) + zero_tensor_local = torch.zeros(lp.ds_tensor.shape) safe_set_local_fp32_param(lp, zero_tensor_local) safe_set_local_optimizer_state(lp, zero_tensor_local, "exp_avg") @@ -432,6 +437,31 @@ These routines can be used at any point after initialization of the DeepSpeed en [...] +The routines for modifying gradients can be used after ``backward`` but before ``step`` as shown in the following snippet. + +.. 
code-block:: python + + backward(loss) + [...] + from deepspeed.runtime.zero.utils import is_zero_param + from deepspeed.utils import safe_set_full_grad, safe_set_local_grad + # Here is an example of how to zero all the gradients. + for n, lp in model.named_parameters(): + # 1. For zero stage 1, 2, or 3 set the full gradient. + zero_tensor = torch.zeros(lp.ds_shape) if is_zero_param(lp) else torch.zeros(lp.shape) + + safe_set_full_grad(lp, zero_tensor) + + # 2. For zero stage 3, each process sets its local gradient partition. + zero_tensor_local = torch.zeros_like(lp.ds_tensor.shape) + + safe_set_local_grad(lp, zero_tensor_local) + + [...] + optimizer.step() + + + GPU Memory Management --------------------- @@ -456,3 +486,72 @@ The following code snippet illustrates this functionality. # Free GPU memory consumed by model parameters ds_engine.empty_partition_cache() + + +Offload States +-------------- + +The DeepSpeed engine maintains a set of states in device memory (e.g., CUDA memory). The following API allows you to offload these states to a different device (currently, only CPU memory is supported), reducing the memory footprint on the device. + +.. code-block:: python + + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False) -> None: + """Offload the engine's states to the specified device. + + Arguments: + include: Optional. The set of states to offload. If not provided, all states are offloaded. + device: Optional. The device to move the ZeRO optimizer buffers to. Currently only `OffloadDeviceEnum.cpu` is supported. + pin_memory: Optional. Whether to pin the memory of the offloaded states. + non_blocking: Optional. Whether to offload the states asynchronously. + """ + +You can selectively offload specific states by specifying the ``OffloadStateTypeEnum`` in the include argument. ``OffloadStateTypeEnum`` is an enum that defines the states that can be offloaded. The following states are supported: + +* ``OffloadStateTypeEnum.optim_states``: Optimizer states. Currently, only states of DeepSpeed's FusedAdam optimizer are supported. +* ``OffloadStateTypeEnum.hp_params``: FP32 parameters. +* ``OffloadStateTypeEnum.lp_params``: BF16/FP16 parameters. +* ``OffloadStateTypeEnum.lp_grads``: BF16/FP16 gradients. +* ``OffloadStateTypeEnum.contiguous_grad_buffer``: The contiguous gradient buffer for reduce operations. + +Note that offloading states comes with a trade-off between memory savings and computational overhead. This API allows states to be reloaded back into device memory when needed. + +.. code-block:: python + + def reload_states(self, non_blocking: bool = False) -> None: + """Reload the engine states to the original device. + + Arguments: + non_blocking: Optional. Whether to offload the states asynchronously. + """ + +Below is an example code snippet demonstrating how to offload FP32 parameters and optimizer states to CPU memory: + +.. code-block:: python + + # Offload after forward, backward, and step + ds_engine.offload_states(include=[OffloadStateTypeEnum.hp_params, OffloadStateTypeEnum.optim_states]) + + # Do something requiring a lot of device memory + ... + # Load states back to device memory + ds_engine.reload_states() + +``deepspeed.runtime.zero.offload_states.get_state_devices`` returns devices of the specified state. + +.. 
code-block:: python + + def get_state_devices(model, state: OffloadStateTypeEnum) -> Set[torch.device]: + """Retrieve the devices of the specified state of the model. + + Args: + model (DeepSpeedEngine): The model whose device allocations are to be checked. + state (OffloadStateTypeEnum): The specific state for which the devices should be retrieved. + + Returns: + Set[torch.device]: A set of devices of the specified state. + + """ diff --git a/op_builder/builder.py b/op_builder/builder.py index e935a179f6afd..1609bc9005f46 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -76,7 +76,7 @@ def get_default_compute_capabilities(): cuda_minor_mismatch_ok = { 10: ["10.0", "10.1", "10.2"], 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], - 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5"], + 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6"], } @@ -253,8 +253,7 @@ def get_rocm_gpu_arch(): rocm_info = Path("rocminfo") rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'" try: - safe_cmd = shlex.split(rocm_gpu_arch_cmd) - result = subprocess.check_output(safe_cmd) + result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True) rocm_gpu_arch = result.decode('utf-8').strip() except subprocess.CalledProcessError: rocm_gpu_arch = "" @@ -272,8 +271,7 @@ def get_rocm_wavefront_size(): rocm_wavefront_size_cmd = str( rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'" try: - safe_cmd = shlex.split(rocm_wavefront_size_cmd) - result = subprocess.check_output(rocm_wavefront_size_cmd) + result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True) rocm_wavefront_size = result.decode('utf-8').strip() except subprocess.CalledProcessError: rocm_wavefront_size = "32" diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py index 30238add3f905..7084db8469f1d 100644 --- a/op_builder/cpu/__init__.py +++ b/op_builder/cpu/__init__.py @@ -8,3 +8,4 @@ from .fused_adam import FusedAdamBuilder from .cpu_adam import CPUAdamBuilder from .no_impl import NotImplementedBuilder +from .async_io import AsyncIOBuilder diff --git a/op_builder/cpu/async_io.py b/op_builder/cpu/async_io.py new file mode 100644 index 0000000000000..493ef174566e7 --- /dev/null +++ b/op_builder/cpu/async_io.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import distutils.spawn +import subprocess + +from .builder import CPUOpBuilder + + +class AsyncIOBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def lib_sources(self): + src_list = [ + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + ] + return src_list + + def sources(self): + return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp'] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + # -O0 for improved debugging, since performance is bound by I/O + args = super().cxx_args() + import torch + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1): + args.remove('-std=c++17') + args.append('-std=c++14') + args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder'] + return args + + def extra_ldflags(self): + return ['-laio', '-fopenmp'] + + def check_for_libaio_pkg(self): + libs = dict( + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.wait() == 0: + found = True + else: + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") + break + return found + + def is_compatible(self, verbose=False): + # Check for the existence of libaio by using distutils + # to compile and link a test program that calls io_submit, + # which is a function provided by libaio that is used in the async_io op. + # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS + # respectively to specify the directories for libaio.h and libaio.so. + aio_compatible = self.has_function('io_submit', ('aio', )) + if verbose and not aio_compatible: + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") + + # Check for the libaio package via known package managers + # to print suggestions on which package to install. + self.check_for_libaio_pkg() + + self.warning( + "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." 
+ ) + return super().is_compatible(verbose) and aio_compatible diff --git a/op_builder/hpu/__init__.py b/op_builder/hpu/__init__.py index 6527ace087b5f..5ad1b9a7f891e 100644 --- a/op_builder/hpu/__init__.py +++ b/op_builder/hpu/__init__.py @@ -7,4 +7,5 @@ from .cpu_adam import CPUAdamBuilder from .fused_adam import FusedAdamBuilder +from .transformer_inference import InferenceBuilder from .no_impl import NotImplementedBuilder diff --git a/op_builder/hpu/transformer_inference.py b/op_builder/hpu/transformer_inference.py new file mode 100644 index 0000000000000..e397c99200ec9 --- /dev/null +++ b/op_builder/hpu/transformer_inference.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +import importlib + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class InferenceBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" + NAME = "transformer_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=self.NAME) + + def absolute_name(self): + return f"deepspeed.ops.transformer.inference.{self.NAME}_op" + + def sources(self): + return [] + + def load(self, verbose=True): + if self.name in __class__._loaded_ops: + return __class__._loaded_ops[self.name] + + from deepspeed.git_version_info import installed_ops # noqa: F401 + if installed_ops.get(self.name, False): + op_module = importlib.import_module(self.absolute_name()) + __class__._loaded_ops[self.name] = op_module + return op_module diff --git a/op_builder/mlu/__init__.py b/op_builder/mlu/__init__.py new file mode 100644 index 0000000000000..db12afbbf20ec --- /dev/null +++ b/op_builder/mlu/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +# MLU related operators will be added in the future. +from .no_impl import NotImplementedBuilder +from .cpu_adagrad import CPUAdagradBuilder +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder diff --git a/op_builder/mlu/builder.py b/op_builder/mlu/builder.py new file mode 100644 index 0000000000000..17b9723ffcc1e --- /dev/null +++ b/op_builder/mlu/builder.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class MLUOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + return ['-O3', '-g', '-Wno-reorder'] + + def libraries_args(self): + return [] diff --git a/op_builder/mlu/cpu_adagrad.py b/op_builder/mlu/cpu_adagrad.py new file mode 100644 index 0000000000000..68b7bbe514eea --- /dev/null +++ b/op_builder/mlu/cpu_adagrad.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class CPUAdagradBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" + NAME = "cpu_adagrad" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adagrad.{self.NAME}_op' + + def sources(self): + return ['csrc/adagrad/cpu_adagrad.cpp'] + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/mlu/cpu_adam.py b/op_builder/mlu/cpu_adam.py new file mode 100644 index 0000000000000..b3c8e476bf396 --- /dev/null +++ b/op_builder/mlu/cpu_adam.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class CPUAdamBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/mlu/fused_adam.py b/op_builder/mlu/fused_adam.py new file mode 100644 index 0000000000000..2343e55568c48 --- /dev/null +++ b/op_builder/mlu/fused_adam.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + +try: + import torch +except ImportError as e: + pass + + +class MLUFusedAdam: + + @staticmethod + def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, + bias_correction, weight_decay, *args): + + torch.ops.torch_mlu.fused_adam(noop_flag_buffer, tensor_lists[0], tensor_lists[1], tensor_lists[2], + tensor_lists[3], lr, beta1, beta2, epsilon, step, adam_w_mode, bias_correction, + weight_decay) + + +class FusedAdamBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return MLUFusedAdam diff --git a/op_builder/mlu/no_impl.py b/op_builder/mlu/no_impl.py new file mode 100644 index 0000000000000..375c148b4a5e0 --- /dev/null +++ b/op_builder/mlu/no_impl.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class NotImplementedBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on MLU backend.") + + def sources(self): + return [] + + def cxx_args(self): + return [] + + def extra_ldflags(self): + return [] + + def include_paths(self): + return [] diff --git a/op_builder/xpu/async_io.py b/op_builder/xpu/async_io.py index 7ed527e016fa7..6a6798eaeb9cf 100644 --- a/op_builder/xpu/async_io.py +++ b/op_builder/xpu/async_io.py @@ -21,11 +21,18 @@ def absolute_name(self): def sources(self): return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', - 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp', + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', + 'csrc/xpu/aio/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', ] def include_paths(self): diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 70c94a745435e..296398f680cc2 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,5 @@ hjson +msgpack ninja numpy packaging>=20.0 diff --git a/setup.py b/setup.py index 1b6768d1b2c3d..e39d8c7e05a3c 100755 --- a/setup.py +++ b/setup.py @@ -298,7 +298,7 @@ def op_enabled(op_name): else: scripts = [ 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 
'bin/ds_bench', 'bin/dsr', - 'bin/ds_elastic' + 'bin/ds_elastic', 'bin/ds_nvme_tune', 'bin/ds_io' ] start_time = time.time() diff --git a/tests/torch_compile/ds_config_z2.json b/tests/torch_compile/ds_config_z2.json new file mode 100644 index 0000000000000..30e1237c558c9 --- /dev/null +++ b/tests/torch_compile/ds_config_z2.json @@ -0,0 +1,40 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 2, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/ds_config_z3.json b/tests/torch_compile/ds_config_z3.json new file mode 100644 index 0000000000000..361bc115eaeeb --- /dev/null +++ b/tests/torch_compile/ds_config_z3.json @@ -0,0 +1,41 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 3, + "reduce_scatter": true, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py new file mode 100644 index 0000000000000..adbf6eaa947aa --- /dev/null +++ b/tests/torch_compile/test_compile.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import argparse +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed import comm + +import torch +import intel_extension_for_pytorch # noqa: F401 # type: ignore +from torch.utils.data import Dataset, DataLoader + +torch._dynamo.config.cache_size_limit = 100 + + +def get_dynamo_stats(): + return torch._dynamo.utils.counters["graph_break"] + + +class RandomDataset(Dataset): + + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size).to(torch.bfloat16) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +data_size = 1024 +data_length = 100 +rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), batch_size=1, shuffle=False) + + +class MyModule(torch.nn.Module): + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.fc0 = torch.nn.Linear(1024, 256, bias=False) + self.fc1 = torch.nn.Linear(256, 256, bias=False) + self.dropout = torch.nn.Dropout(0.5) + + def forward(self, data, residual): + output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5 + return output + + +model = MyModule() +params = model.parameters() + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') +parser.add_argument('--deepspeed_config', + type=str, + default='ds_config_z3.json', + help='path to DeepSpeed configuration file') +cmd_args = parser.parse_args() + +# initialize the DeepSpeed engine +model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params) +model_engine.compile() + +residual = torch.rand(256, 256, dtype=torch.float).to(get_accelerator().current_device_name()) + +start_stats = get_dynamo_stats() + +if comm.get_rank() == 0: + #print(dynamo_stats['graph_breaks']) + for item in start_stats.items(): + print(item) + +for step, batch in enumerate(rand_loader): + if step % 10 == 0 and comm.get_rank() == 0: + print(f'step={step}') + # forward() method + loss = model_engine(batch.to(get_accelerator().current_device_name()), residual).sum() + # runs backpropagation + model_engine.backward(loss) + # weight update + model_engine.step() + +dynamo_stats = get_dynamo_stats() + +if comm.get_rank() == 0: + # print break down of graph break stats with markdown, print in table format, start with reason, then count + # print a tag 'dynamo_output' before each line to allow post processing + print("dynamo_output | Reason | Count |") + print("dynamo_output | ------ | ----- |") + for item in dynamo_stats.items(): + # replace '|' in item[0] with a literal '|' to avoid mess with table format + item = (item[0].replace('|', r'\|'), item[1]) + print(f"dynamo_output | {item[0]} | {item[1]} |") + print(f"dynamo_output | Total | {sum(dynamo_stats.values())} |") diff --git a/tests/unit/common.py b/tests/unit/common.py index c9eb7ffaa5f46..685f943df2fec 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -25,6 +25,8 @@ # Worker timeout for tests that hang DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600')) +warn_reuse_dist_env = False + def is_rocm_pytorch(): return hasattr(torch.version, 'hip') and torch.version.hip is not None @@ -145,16 +147,13 @@ class DistributedExec(ABC): def run(self): ... 
- def __call__(self, request=None): + def __call__(self, request): self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) world_size = self.world_size if self.requires_cuda_env and not get_accelerator().is_available(): pytest.skip("only supported in accelerator environments.") - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) + self._launch_with_file_store(request, world_size) def _get_fixture_kwargs(self, request, func): if not request: @@ -170,7 +169,7 @@ def _get_fixture_kwargs(self, request, func): pass # test methods can have kwargs that are not fixtures return fixture_kwargs - def _launch_daemonic_procs(self, num_procs): + def _launch_daemonic_procs(self, num_procs, init_method): # Create process pool or use cached one master_port = None @@ -179,6 +178,13 @@ def _launch_daemonic_procs(self, num_procs): print("Ignoring reuse_dist_env for hpu") self.reuse_dist_env = False + global warn_reuse_dist_env + if self.reuse_dist_env and not warn_reuse_dist_env: + # Currently we see memory leak for tests that reuse distributed environment + print("Ignoring reuse_dist_env and forcibly setting it to False") + warn_reuse_dist_env = True + self.reuse_dist_env = False + if self.reuse_dist_env: if num_procs not in self._pool_cache: self._pool_cache[num_procs] = mp.Pool(processes=num_procs) @@ -189,7 +195,7 @@ def _launch_daemonic_procs(self, num_procs): master_port = get_master_port() # Run the test - args = [(local_rank, num_procs, master_port) for local_rank in range(num_procs)] + args = [(local_rank, num_procs, master_port, init_method) for local_rank in range(num_procs)] skip_msgs_async = pool.starmap_async(self._dist_run, args) try: @@ -209,7 +215,7 @@ def _launch_daemonic_procs(self, num_procs): assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" pytest.skip(skip_msgs[0]) - def _launch_non_daemonic_procs(self, num_procs): + def _launch_non_daemonic_procs(self, num_procs, init_method): assert not self.reuse_dist_env, "Cannot reuse distributed environment with non-daemonic processes" master_port = get_master_port() @@ -218,7 +224,7 @@ def _launch_non_daemonic_procs(self, num_procs): prev_start_method = mp.get_start_method() mp.set_start_method('spawn', force=True) for local_rank in range(num_procs): - p = mp.Process(target=self._dist_run, args=(local_rank, num_procs, master_port, skip_msg)) + p = mp.Process(target=self._dist_run, args=(local_rank, num_procs, master_port, init_method, skip_msg)) p.start() processes.append(p) mp.set_start_method(prev_start_method, force=True) @@ -260,7 +266,7 @@ def _launch_non_daemonic_procs(self, num_procs): # add a check here to assert all exit messages are equal pytest.skip(skip_msg.get()) - def _launch_procs(self, num_procs): + def _launch_procs(self, num_procs, init_method): # Verify we have enough accelerator devices to run this test if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: pytest.skip( @@ -275,11 +281,11 @@ def _launch_procs(self, num_procs): mp.set_start_method('forkserver', force=True) if self.non_daemonic_procs: - self._launch_non_daemonic_procs(num_procs) + self._launch_non_daemonic_procs(num_procs, init_method) else: - self._launch_daemonic_procs(num_procs) + self._launch_daemonic_procs(num_procs, init_method) - def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): + def _dist_run(self, local_rank, num_procs, master_port, init_method, skip_msg=""): if not dist.is_initialized(): """ 
Initialize deepspeed.comm and execute the user function. """ if self.set_dist_env: @@ -303,7 +309,10 @@ def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): get_accelerator().set_device(local_rank) if self.init_distributed: - deepspeed.init_distributed(dist_backend=self.backend) + deepspeed.init_distributed(dist_backend=self.backend, + init_method=init_method, + rank=local_rank, + world_size=num_procs) dist.barrier() try: @@ -319,6 +328,22 @@ def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): return skip_msg + def _launch_with_file_store(self, request, world_size): + tmpdir = request.getfixturevalue("tmpdir") + dist_file_store = tmpdir.join("dist_file_store") + assert not os.path.exists(dist_file_store) + init_method = f"file://{dist_file_store}" + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + try: + self._launch_procs(procs, init_method) + finally: + if os.path.exists(dist_file_store): + os.remove(dist_file_store) + time.sleep(0.5) + def _dist_destroy(self): if (dist is not None) and dist.is_initialized(): dist.barrier() @@ -464,11 +489,7 @@ def __call__(self, request): else: world_size = self._fixture_kwargs.get("world_size", self.world_size) - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) - time.sleep(0.5) + self._launch_with_file_store(request, world_size) def _get_current_test_func(self, request): # DistributedTest subclasses may have multiple test methods diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 581a2ce433edc..9b563523dbebb 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -10,6 +10,7 @@ import os import time import requests +import fcntl from dataclasses import dataclass from typing import List @@ -95,9 +96,12 @@ def _hf_model_list() -> List[ModelInfo]: if os.path.isfile(cache_file_path): with open(cache_file_path, 'rb') as f: try: + fcntl.flock(f, fcntl.LOCK_SH) model_data = pickle.load(f) except Exception as e: print(f"Error loading cache file {cache_file_path}: {e}") + finally: + fcntl.flock(f, fcntl.LOCK_UN) current_time = time.time() @@ -125,7 +129,11 @@ def _hf_model_list() -> List[ModelInfo]: # Save the updated cache os.makedirs(cache_dir, exist_ok=True) with open(cache_file_path, 'wb') as f: - pickle.dump(model_data, f) + try: + fcntl.flock(f, fcntl.LOCK_EX) + pickle.dump(model_data, f) + finally: + fcntl.flock(f, fcntl.LOCK_UN) return model_data["model_list"] diff --git a/tests/unit/launcher/test_multinode_runner.py b/tests/unit/launcher/test_multinode_runner.py index ec0459ab0a6fe..a3b50a4c90ab2 100644 --- a/tests/unit/launcher/test_multinode_runner.py +++ b/tests/unit/launcher/test_multinode_runner.py @@ -32,6 +32,27 @@ def test_openmpi_runner(runner_info): runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) cmd = runner.get_cmd(env, resource_pool) assert cmd[0] == 'mpirun' + assert 'eth0' in cmd + + +def test_btl_nic_openmpi_runner(runner_info): + env, resource_pool, world_info, _ = runner_info + args = parse_args(['--launcher_arg', '-mca btl_tcp_if_include eth1', 'test_launcher.py']) + + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert 'eth0' not in cmd + assert 'eth1' in cmd + + +def test_btl_nic_two_dashes_openmpi_runner(runner_info): + env, resource_pool, world_info, _ = runner_info + args = parse_args(['--launcher_arg', '--mca 
btl_tcp_if_include eth1', 'test_launcher.py']) + + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert 'eth0' not in cmd + assert 'eth1' in cmd def test_mpich_runner(runner_info): diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index e6927efc38248..a074cfca317fe 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -23,12 +23,10 @@ pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True) -def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): - if not get_accelerator().is_available(): - if use_cuda_device: - pytest.skip("GPU tensors only supported in CUDA environments.") +def _skip_for_invalid_environment(use_cuda_pinned_tensor=True): + if get_accelerator().device_name() != 'cuda': if use_cuda_pinned_tensor: - pytest.skip("CUDA-pinned tensors only supported in CUDA environments.") + pytest.skip("torch.pin_memory is only supported in CUDA environments.") def _get_local_rank(): @@ -52,13 +50,13 @@ def _get_test_write_file(tmpdir, index): return os.path.join(tmpdir, f'_aio_write_random_{file_suffix}.pt') -def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): +def _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer, index=0): test_file = _get_test_write_file(tmpdir, index) test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) return test_file, test_buffer -def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): +def _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) @@ -73,12 +71,12 @@ def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, ind def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_intra_op_parallelism() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) @pytest.mark.parametrize("single_submit", [True, False]) @pytest.mark.parametrize("overlap_events", [True, False]) class TestRead(DistributedTest): @@ -89,12 +87,15 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) - if use_cuda_pinned_tensor: + if use_unpinned_tensor: + aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + elif use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) 
else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) @@ -112,14 +113,14 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, over if not use_cuda_pinned_tensor: h.free_cpu_locked_tensor(aio_buffer) - @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) use_cpu_locked_tensor = False h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) - if cuda_device: + if use_unpinned_tensor: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) elif use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) @@ -144,7 +145,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap h.free_cpu_locked_tensor(aio_buffer) -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) @pytest.mark.parametrize("single_submit", [True, False]) @pytest.mark.parametrize("overlap_events", [True, False]) class TestWrite(DistributedTest): @@ -155,16 +156,19 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + if use_unpinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer) if use_cuda_pinned_tensor: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer) else: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, h) _validate_handle_state(h, single_submit, overlap_events) @@ -179,20 +183,20 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove filecmp.clear_cache() assert filecmp.cmp(ref_file, aio_file, shallow=False) - @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + 
_skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False - if cuda_device: - aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) + if use_unpinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer) elif use_cuda_pinned_tensor: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer) else: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, h) use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) @@ -213,8 +217,8 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla @pytest.mark.sequential -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken -@pytest.mark.parametrize("cuda_device", [True, False]) +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize("use_unpinned_tensor", [True, False]) class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False @@ -223,8 +227,8 @@ class TestAsyncQueue(DistributedTest): set_dist_env = False @pytest.mark.parametrize("async_queue", [2, 3]) - def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] for i in range(async_queue): @@ -236,7 +240,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False - if cuda_device: + if use_unpinned_tensor: aio_buffers = [ torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) @@ -270,8 +274,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): h.free_cpu_locked_tensor(t) @pytest.mark.parametrize("async_queue", [2, 3]) - def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] ref_buffers = [] @@ -287,16 +291,16 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): aio_files = [] aio_buffers = [] for i in range(async_queue): - if cuda_device: - f, buf = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffers[i], i) + if use_unpinned_tensor: + f, buf = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffers[i], i) elif use_cuda_pinned_tensor: - f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], None, i) + f, buf = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffers[i], None, i) else: - f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, 
ref_buffers[i], h, i) + f, buf = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffers[i], h, i) aio_files.append(f) aio_buffers.append(buf) - use_cpu_locked_tensor = not (cuda_device or use_cuda_pinned_tensor) + use_cpu_locked_tensor = not (use_unpinned_tensor or use_cuda_pinned_tensor) _validate_handle_state(h, single_submit, overlap_events) diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py index 53655994b5601..e94d42cd22af8 100644 --- a/tests/unit/ops/aio/test_gds.py +++ b/tests/unit/ops/aio/test_gds.py @@ -54,7 +54,7 @@ def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_intra_op_parallelism() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index 843c9b889c2bf..f25bbc1be6925 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -8,12 +8,13 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_add import BiasAddOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -22,15 +23,8 @@ def run_bias_add_reference(activations, bias): def run_bias_add_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_add_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_add_bf16(activations, bias) - else: - return inference_module.bias_add_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasAddOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py index d5ab13964974c..05de4fbb4cf8d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -8,13 +8,13 @@ import deepspeed from deepspeed.ops.op_builder import InferenceBuilder from deepspeed.accelerator import get_accelerator +from deepspeed.ops.transformer.inference.op_binding.gated_activation import GatedActivationOp from deepspeed.utils.types import ActivationFuncType from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -27,10 +27,7 @@ def run_bias_geglu_reference(activations, bias): def run_bias_geglu_ds(activation, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return 
inference_module.gated_activation(activation, bias, ActivationFuncType.GATED_GELU) + return GatedActivationOp()(activation, bias, ActivationFuncType.GATED_GELU) @pytest.mark.inference_ops @@ -56,17 +53,14 @@ def run_gated_silu_reference(activations, bias): def run_gated_silu_ds(activation, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.gated_activation(activation, bias, ActivationFuncType.GATED_SILU) + return GatedActivationOp()(activation, bias, ActivationFuncType.GATED_SILU) @pytest.mark.inference_ops @pytest.mark.parametrize("batch", [1, 2]) @pytest.mark.parametrize("sequence", [1, 128, 255]) @pytest.mark.parametrize("channels", [512, 1232, 4096]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("dtype", get_dtypes()) def test_gated_silu(batch, sequence, channels, dtype): activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device=get_accelerator().device_name()) bias = torch.randn((channels * 2), dtype=dtype, device=get_accelerator().device_name()) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index fd82da51380c4..b69030e87ace5 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -8,13 +8,14 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp from .inference_test_utils import allclose, get_dtypes from packaging import version as pkg_version if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -25,15 +26,8 @@ def run_bias_gelu_reference(activations, bias): def run_bias_gelu_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_gelu_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_gelu_bf16(activations, bias) - else: - return inference_module.bias_gelu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasGeluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index 881af78e92cf6..57134665b2412 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -8,12 +8,13 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_relu import BiasReluOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -23,15 +24,8 @@ def run_bias_relu_reference(activations, bias): def run_bias_relu_ds(activations, bias): 
- global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_relu_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_relu_bf16(activations, bias) - else: - return inference_module.bias_relu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasReluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py index 675860b00bdb9..beb74d09ab308 100644 --- a/tests/unit/ops/transformer/inference/test_gelu.py +++ b/tests/unit/ops/transformer/inference/test_gelu.py @@ -7,11 +7,12 @@ import torch import deepspeed from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -45,13 +46,8 @@ def run_gelu_ds(activations, use_triton_ops=False): device = deepspeed.accelerator.get_accelerator().device_name() channels = activations.shape[-1] bias = torch.zeros((channels), dtype=activations.dtype, device=device) - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_gelu_fp16(activations, bias) - else: - return inference_module.bias_gelu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasGeluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 9eac612aa29ca..2912807e9f43e 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -8,6 +8,7 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from .inference_test_utils import allclose, get_dtypes, assert_almost_equal try: import triton # noqa: F401 # type: ignore @@ -21,8 +22,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): vals_f = vals.to(torch.float32) @@ -32,10 +31,7 @@ def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): def ds_implementation(vals, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.layer_norm(vals, gamma, beta, epsilon) + return LayerNormOp()(vals, gamma, beta, epsilon) def ds_triton_implementation(vals, gamma, beta, epsilon): @@ -83,10 +79,7 @@ def residual_ref_implementation(vals, bias, res, gamma, beta, epsilon, channels, def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module._layer_norm_residual(vals, 
bias, res, gamma, beta, epsilon) + return LayerNormOp.layer_norm_residual(vals, bias, res, gamma, beta, epsilon) def residual_ds_triton_implementation(vals, bias, res, gamma, beta, epsilon): @@ -137,10 +130,7 @@ def residual_store_ref_implementation(vals, bias, res, gamma, beta, epsilon, cha def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) + return LayerNormOp.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index e1c8127a83ace..dcf9f16baaf16 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -8,24 +8,20 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.moe_res_matmul import MoEResMatmulOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def run_moe_res_matmul_reference(residual, coef1, coef2, output): return residual * coef1 + output * coef2 def run_moe_res_matmul_ds(residual, coef, output): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() coef_t = coef.transpose(-1, -2).contiguous() - return inference_module.moe_res_matmul(residual, coef_t, output) + return MoEResMatmulOp()(residual, coef_t, output) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index 91830e25fc819..807da4904341f 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -8,6 +8,8 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding import ResidualAddOp from .inference_test_utils import get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -36,11 +38,6 @@ def allclose(x, y): return torch.allclose(x, y, rtol=rtol, atol=atol) -@pytest.fixture(scope="module") -def inference_module(): - return InferenceBuilder().load() - - def res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size=1, pre_attn_norm=True): if pre_attn_norm: hidden_state += (residual + final_bias + attn_output + attn_bias) / mp_size @@ -75,8 +72,8 @@ def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, f @pytest.mark.parametrize("mp_size", [1, 2]) @pytest.mark.parametrize("pre_attn_norm", [True, False]) @pytest.mark.parametrize("use_triton_ops", [True, False]) -def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, - pre_attn_norm, use_triton_ops): +def test_residual_add(batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm, + use_triton_ops): if not 
deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) @@ -96,19 +93,9 @@ def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_ if use_triton_ops: from deepspeed.ops.transformer.inference.triton import residual_add_bias ds_out = residual_add_bias(*res_add_args) - if dtype == torch.float16: - ds_out = inference_module.residual_add_bias_fp16(*res_add_args) - elif dtype == torch.float32: - ds_out = inference_module.residual_add_bias_fp32(*res_add_args) - elif dtype == torch.bfloat16: - ds_out = inference_module.residual_add_bias_bf16(*res_add_args) else: - if dtype == torch.float16: - ds_out = inference_module.residual_add_bias_fp16(*res_add_args) - elif dtype == torch.float32: - ds_out = inference_module.residual_add_bias_fp32(*res_add_args) - else: - raise ValueError(f"Unsupported dtype: {dtype}") + config = DeepSpeedInferenceConfig(dtype=dtype) + ds_out = ResidualAddOp(config).residual_add_func(*res_add_args) if not allclose(ds_out, ref_out): print((ds_out - ref_out).abs().max()) diff --git a/tests/unit/ops/transformer/inference/test_rms_norm.py b/tests/unit/ops/transformer/inference/test_rms_norm.py index 508a40e12e8df..fde9c9510771a 100644 --- a/tests/unit/ops/transformer/inference/test_rms_norm.py +++ b/tests/unit/ops/transformer/inference/test_rms_norm.py @@ -8,13 +8,13 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder # type: ignore +from deepspeed.ops.transformer.inference.op_binding.pre_rms_norm import PreRMSNormOp +from deepspeed.ops.transformer.inference.op_binding.rms_norm import RMSNormOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def ref_implementation(vals, gamma, epsilon): variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) @@ -27,10 +27,7 @@ def ref_implementation(vals, gamma, epsilon): def ds_implementation(vals, gamma, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.rms_norm(vals, gamma, epsilon) + return RMSNormOp()(vals, gamma, epsilon) @pytest.mark.inference_ops @@ -51,10 +48,7 @@ def test_rms_norm(batch, seq_len, channels, dtype): def pre_ds_implementation(vals, residual, gamma, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.pre_rms_norm(vals, residual, gamma, epsilon) + return PreRMSNormOp()(vals, residual, gamma, epsilon) def pre_ref_implementation(vals, residual, gamma, epsilon): diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py index 7d0d6e14b6513..4f6d69160aa7f 100644 --- a/tests/unit/ops/transformer/inference/test_softmax.py +++ b/tests/unit/ops/transformer/inference/test_softmax.py @@ -11,7 +11,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -43,6 +42,7 @@ def run_softmax_ds(input, use_triton_ops=False): def test_softmax(batch, sequence, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and 
use_triton_ops: pytest.skip("triton has to be installed for the test") + device = deepspeed.accelerator.get_accelerator().device_name() input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) input_ref = input_ds.clone().detach() diff --git a/tests/unit/runtime/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py index bcfc485f2b8f5..47734c0cd864d 100644 --- a/tests/unit/runtime/test_lr_schedulers.py +++ b/tests/unit/runtime/test_lr_schedulers.py @@ -37,6 +37,9 @@ def _verify_staircase_increase(values, step_size): (WARMUP_DECAY_LR, { WARMUP_NUM_STEPS: 10, TOTAL_NUM_STEPS: 20 + }), (WARMUP_COSINE_LR, { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 }), (ONE_CYCLE, { CYCLE_MIN_LR: 0, CYCLE_MAX_LR: 0.1 @@ -71,6 +74,11 @@ def test(self, scheduler_type, params): hidden_dim=hidden_dim, device=model.device, dtype=torch.float) + + true_lrs = lr_scheduler.get_lr() + for group, true_lr in zip(model.optimizer.param_groups, true_lrs): + assert group['lr'] == true_lr, f"True lr {true_lr}, optimizer lr {group['lr']}" + for n, batch in enumerate(data_loader): # get lr before training starts lr_scheduler.get_lr() diff --git a/tests/unit/runtime/zero/test_nvme_checkpointing.py b/tests/unit/runtime/zero/test_nvme_checkpointing.py index 75cba2e789c16..850c8eb3e3496 100644 --- a/tests/unit/runtime/zero/test_nvme_checkpointing.py +++ b/tests/unit/runtime/zero/test_nvme_checkpointing.py @@ -15,6 +15,7 @@ from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.partition_parameters import Init from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator class TestNVMeCheckpointing(DistributedTest): @@ -29,6 +30,9 @@ def test_nvme_checkpointing(self, tmpdir, param_offload_device, optim_offload_de first_stage_steps, second_stage_steps = 2, 2 + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: pytest.skip('Skip tests since async-io is not compatible') diff --git a/tests/unit/runtime/zero/test_offload_states.py b/tests/unit/runtime/zero/test_offload_states.py new file mode 100644 index 0000000000000..9105a54661fac --- /dev/null +++ b/tests/unit/runtime/zero/test_offload_states.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest + +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +import torch + +from unit.common import DistributedTest +from unit.simple_model import random_dataloader, SimpleModel + +import deepspeed +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum +from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_optimizer_state +from deepspeed.runtime.zero.offload_states import get_state_devices + + +def validate_device(model, device: torch.device, include) -> None: + + def compare_device(state) -> bool: + devices = get_state_devices(model, state) + return len(devices) == 1 and device in devices + + for state in OffloadStateTypeEnum: + if include is None or state in include: + if state == OffloadStateTypeEnum.contiguous_grad_buffer and device == torch.device("cpu"): + assert len(get_state_devices(model, + state)) == 0, f"State {state} must be removed after offload_states()" + else: + assert compare_device(state), f"State {state} is not on device {device}" + + +def run_model(model, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking): + # Currently we only support OffloadDeviceEnum.cpu + offload_device = OffloadDeviceEnum.cpu + + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + for batch in data_loader: + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + hp_params_expected = [safe_get_local_fp32_param(p).clone() for p in model.parameters()] + lp_params_expected = [p.ds_tensor.clone() for p in model.parameters()] + lp_grads_expected = model.optimizer.grad_partitions_flat_buffer.clone() + adam_exp_avg_expected = [safe_get_local_optimizer_state(p, "exp_avg").clone() for p in model.parameters()] + adam_exp_avg_sq = [safe_get_local_optimizer_state(p, "exp_avg_sq").clone() for p in model.parameters()] + + # Start offloading + alloc_before_offload = get_accelerator().memory_allocated() + model.offload_states(include=include, device=offload_device, pin_memory=pin_memory, non_blocking=non_blocking) + alloc_after_offload = get_accelerator().memory_allocated() + assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload" + + validate_device(model, torch.device(offload_device.value), include) + + # Reload states + model.reload_states() + assert alloc_after_offload < get_accelerator().memory_allocated( + ), f"Allocated memory should increase after offload back" + + # Verify restored states + hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()] + for hp_param_expected, hp_param_restored in zip(hp_params_expected, hp_param_restored): + assert torch.equal(hp_param_expected, hp_param_restored) + + lp_param_restored = [p.ds_tensor for p in model.parameters()] + + for lp_param_expected, lp_param_restored in zip(lp_params_expected, lp_param_restored): + assert torch.equal(lp_param_expected, lp_param_restored) + + assert torch.equal(lp_grads_expected, model.optimizer.grad_partitions_flat_buffer) + + adam_exp_avg_restored = [safe_get_local_optimizer_state(p, "exp_avg") for p in model.parameters()] + for adam_exp_avg_expected, adam_exp_avg_restored in zip(adam_exp_avg_expected, adam_exp_avg_restored): + assert torch.equal(adam_exp_avg_expected, 
adam_exp_avg_restored) + + adam_exp_avg_sq_restored = [safe_get_local_optimizer_state(p, "exp_avg_sq") for p in model.parameters()] + for adam_exp_avg_sq_expected, adam_exp_avg_sq_restored in zip(adam_exp_avg_sq, adam_exp_avg_sq_restored): + assert torch.equal(adam_exp_avg_sq_expected, adam_exp_avg_sq_restored) + + validate_device(model, torch.device(get_accelerator().current_device_name()), include) + + # Needed in ZeRO 3. Not doing so can give memory leak + model.destroy() + + +@pytest.mark.parametrize("included_state", [ + OffloadStateTypeEnum.hp_params, OffloadStateTypeEnum.lp_params, OffloadStateTypeEnum.optim_states, + OffloadStateTypeEnum.lp_grads, OffloadStateTypeEnum.contiguous_grad_buffer, None +]) +@pytest.mark.parametrize("pin_memory", [False, True]) +@pytest.mark.parametrize("non_blocking", [False, True]) +class TestOffloadStates(DistributedTest): + # Need multiple gpus to test possible hanging + world_size = 2 + + def test_offload_states(self, included_state, pin_memory, non_blocking): + hidden_dim = 1024 + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "zero_optimization": { + "stage": 3, + } + } + config_dict["bf16"] = {"enabled": True} + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = SimpleModel(hidden_dim, nlayers=4) + + include = None if included_state is None else [included_state] + run_model(model, config_dict, hidden_dim, torch.bfloat16, include, pin_memory, non_blocking) diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py index 3bb4af3e3d912..2e3a652668ed5 100644 --- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -13,9 +13,9 @@ import deepspeed from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state -from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_optimizer_state +from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_grad, safe_set_full_optimizer_state from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state -from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state +from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.aio import AsyncIOBuilder from deepspeed.accelerator import get_accelerator @@ -23,6 +23,7 @@ WEIGHT_KEY = 'weight' FIRST_ORDER_KEY = 'exp_avg' SECOND_ORDER_KEY = 'exp_avg_sq' +GRADIENT_KEY = 'gradient' def validate_tensor(model, api_type, opt_states): @@ -180,13 +181,14 @@ def test_bf16_fragments(self, frozen_weights): run_fragmented_model(model, config_dict, hidden_dim, torch.bfloat16, validate_after_bwd, validate_after_step) -def create_random_values(model, key_list, group, use_cuda=True): +def create_random_values(model, key_list, group, grad_dtype, use_cuda=True): param_values = {} for n, lp in model.named_parameters(): param_shape = lp.ds_shape if hasattr(lp, 'ds_id') else lp.shape param_values[n] = {} for key in key_list: - rand_value = torch.rand(param_shape, dtype=torch.float32, device=model.device) + dtype = grad_dtype if key == GRADIENT_KEY else torch.float32 + rand_value = torch.rand(param_shape, dtype=dtype, device=model.device) dist.broadcast(rand_value, src=0, group=group) param_values[n][key] = 
rand_value return param_values @@ -195,7 +197,9 @@ def create_random_values(model, key_list, group, use_cuda=True): def set_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, value_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + safe_set_full_grad(lp, value_tensor) + elif key == WEIGHT_KEY: safe_set_full_fp32_param(lp, value_tensor) else: safe_set_full_optimizer_state(lp, value_tensor, key) @@ -204,21 +208,25 @@ def set_param_values_with_dict(model, value_dict): def validate_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, expected_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + actual_tensor = safe_get_full_grad(lp) + elif key == WEIGHT_KEY: actual_tensor = safe_get_full_fp32_param(lp) else: actual_tensor = safe_get_full_optimizer_state(lp, key) + assert torch.equal(expected_tensor, actual_tensor) -def create_random_values_for_local(model, key_list, group, use_cuda=True): +def create_random_values_for_local(model, key_list, group, grad_dtype, use_cuda=True): param_values = {} for n, lp in model.named_parameters(): param_shape = lp.ds_tensor.shape param_values[n] = {} for key in key_list: device = model.device if use_cuda else "cpu" - rand_value = torch.rand(param_shape, dtype=torch.float32, device=device) + dtype = grad_dtype if key == GRADIENT_KEY else torch.float32 + rand_value = torch.rand(param_shape, dtype=dtype, device=device) # dist.broadcast(rand_value, src=0, group=group) param_values[n][key] = rand_value return param_values @@ -228,7 +236,9 @@ def set_local_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, value_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + safe_set_local_grad(lp, value_tensor) + elif key == WEIGHT_KEY: safe_set_local_fp32_param(lp, value_tensor) else: safe_set_local_optimizer_state(lp, value_tensor, key) @@ -237,10 +247,13 @@ def set_local_param_values_with_dict(model, value_dict): def validate_local_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, expected_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + actual_tensor = safe_get_local_grad(lp) + elif key == WEIGHT_KEY: actual_tensor = safe_get_local_fp32_param(lp) else: actual_tensor = safe_get_local_optimizer_state(lp, key) + assert torch.equal(expected_tensor, actual_tensor) @@ -325,12 +338,20 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, dtyp dist.barrier() - def validate_func(model): - optim_keys = [WEIGHT_KEY, FIRST_ORDER_KEY, SECOND_ORDER_KEY] + def after_bwd_validate_func(model): + state_keys = [WEIGHT_KEY, GRADIENT_KEY] + helper_funcs = helper_funcs_mapping[api_type] + optim_state_values = helper_funcs["create_random_values"]( + model, state_keys, group, grad_dtype=dtype, use_cuda=offload_device == OffloadDeviceEnum.none) + helper_funcs["set_param_values_with_dict"](model, optim_state_values) + helper_funcs["validate_param_values_with_dict"](model, optim_state_values) + + def after_step_validate_func(model): + state_keys = [WEIGHT_KEY, FIRST_ORDER_KEY, SECOND_ORDER_KEY] helper_funcs = helper_funcs_mapping[api_type] optim_state_values = helper_funcs["create_random_values"]( - model, optim_keys, group, use_cuda=offload_device == OffloadDeviceEnum.none) + model, state_keys, group, grad_dtype=dtype, use_cuda=offload_device == OffloadDeviceEnum.none) 
helper_funcs["set_param_values_with_dict"](model, optim_state_values) helper_funcs["validate_param_values_with_dict"](model, optim_state_values) - run_fragmented_model(model, config_dict, hidden_dim, dtype, lambda _: None, validate_func) + run_fragmented_model(model, config_dict, hidden_dim, dtype, after_bwd_validate_func, after_step_validate_func) diff --git a/version.txt b/version.txt index 4312e0d0cae3a..1985d91413de8 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.15.2 +0.15.3