diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
index ea8d2f5f806f8..6e8d5847835d0 100644
--- a/.github/workflows/amd-mi200.yml
+++ b/.github/workflows/amd-mi200.yml
@@ -48,8 +48,6 @@ jobs:
- name: Install (ROCm) apex
run: |
git clone https://github.com/ROCmSoftwarePlatform/apex.git
- cd apex
- git checkout torch_2.1_higher
CURRENT_VER=$(git rev-parse HEAD)
INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
index fc0dac5de9a26..007313964f4aa 100644
--- a/.github/workflows/cpu-inference.yml
+++ b/.github/workflows/cpu-inference.yml
@@ -27,7 +27,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- id: setup-venv
uses: ./.github/workflows/setup-venv
diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml
index 0de6832b37c13..0125fa50bc140 100644
--- a/.github/workflows/cpu-torch-latest.yml
+++ b/.github/workflows/cpu-torch-latest.yml
@@ -19,7 +19,7 @@ concurrency:
jobs:
unit-tests:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
diff --git a/.github/workflows/hpu-gaudi2-nightly.yml b/.github/workflows/hpu-gaudi2-nightly.yml
new file mode 100644
index 0000000000000..5c5caff1ebb04
--- /dev/null
+++ b/.github/workflows/hpu-gaudi2-nightly.yml
@@ -0,0 +1,85 @@
+name: hpu-gaudi2-nightly
+
+on:
+ workflow_dispatch:
+ schedule:
+ - cron: "0 0 * * *"
+ pull_request:
+ paths:
+ - ".github/workflows/hpu-gaudi2-nightly.yml"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ unit-tests:
+ # The type of runner that the job will run on
+ runs-on: [self-hosted, intel, gaudi2]
+ container:
+ image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ ports:
+ - 80
+ options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
+
+ env:
+ PT_HPU_LAZY_MODE: 0
+ TORCHINDUCTOR_COMPILE_THREADS: 1
+ TEST_LIST: |
+ test_adamw.py
+ test_bf16.py
+ test_ds_config_dict.py
+ test_dynamic_loss_scale.py
+ test_latest_checkpoint.py
+ test_moe_checkpoint.py
+ test_multi_output_model.py
+ test_other_optimizer.py
+ test_pipe.py
+ test_pipeline.py
+ test_universal_checkpoint.py
+ test_zero_context_return.py
+ test_zero_leaf_module.py
+ test_zero_offloadpp.py
+ test_zero_tiled.py
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v4
+
+ - name: Check container state
+ run: |
+ ldd --version
+ hl-smi -L
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install deepspeed
+ run: |
+ pip install .[dev,autotuning]
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
+ export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
+ TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
+ echo "TEST_LIST ${TEST_LIST}"
+ pytest --verbose unit/ -k "${TEST_LIST}"
diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml
index 4e9ceb32b6b16..9f1a9d973ca27 100644
--- a/.github/workflows/hpu-gaudi2.yml
+++ b/.github/workflows/hpu-gaudi2.yml
@@ -39,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
- image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+ image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml
index eb3ac9b03161a..1a13c0f3f4f10 100644
--- a/.github/workflows/no-torch.yml
+++ b/.github/workflows/no-torch.yml
@@ -19,7 +19,7 @@ permissions:
jobs:
unit-tests:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@@ -30,6 +30,7 @@ jobs:
- name: Python environment
run: |
pip uninstall torch --yes
+ pip install setuptools
pip list
- name: Build deepspeed
diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 484948b28e34a..639f27498dd9d 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -23,7 +23,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
- image: nvcr.io/nvidia/pytorch:23.03-py3
+ image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@@ -47,8 +47,6 @@ jobs:
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
- # Update packages included in the container that do not support pydantic 2+ to versions that do
- python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
@@ -58,8 +56,8 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
- python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12"
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12"
- name: MII unit tests
run: |
BRANCH="main"
diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
index 2ad336cac4edc..329a1060f5ebd 100644
--- a/.github/workflows/nv-ds-chat.yml
+++ b/.github/workflows/nv-ds-chat.yml
@@ -12,6 +12,7 @@ on:
type: string
pull_request:
paths:
+ - ".github/workflows/nv-ds-chat.yml"
- "deepspeed/runtime/zero/stage_1_and_2.py"
- "deepspeed/runtime/zero/stage3.py"
- "deepspeed/runtime/hybrid_engine.py"
@@ -42,6 +43,7 @@ jobs:
- name: Install deepspeed
run: |
+ pip install transformers==4.45.2
pip install .[dev]
ds_report
diff --git a/.github/workflows/nv-human-eval.yml b/.github/workflows/nv-human-eval.yml
index 3de878547d6e7..2ecdf218b96af 100644
--- a/.github/workflows/nv-human-eval.yml
+++ b/.github/workflows/nv-human-eval.yml
@@ -11,7 +11,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
- image: nvcr.io/nvidia/pytorch:23.03-py3
+ image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@@ -50,4 +50,4 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12"
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index 044c282ba1198..f92aa7edfdd51 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -22,7 +22,7 @@ jobs:
runs-on: [self-hosted, nvidia, cu121, v100]
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- id: setup-venv
uses: ./.github/workflows/setup-venv
diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
index 72ba8abbd95df..fc810bc190d0d 100644
--- a/.github/workflows/nv-pre-compile-ops.yml
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -21,7 +21,7 @@ concurrency:
jobs:
unit-tests:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
index 0344c80451a68..af406075b868e 100644
--- a/.github/workflows/nv-sd.yml
+++ b/.github/workflows/nv-sd.yml
@@ -27,7 +27,7 @@ jobs:
sd-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
- image: nvcr.io/nvidia/pytorch:23.03-py3
+ image: nvcr.io/nvidia/pytorch:24.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"
@@ -53,8 +53,6 @@ jobs:
pip install image-similarity-measures
python -m pip install opencv-python==4.6.* --force-reinstall
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
- # Update packages included in the container that do not support pydantic 2+ to versions that do
- python -m pip install thinc spacy confection --upgrade
python -m pip install .[dev,1bit,autotuning,sd]
ds_report
- name: Python environment
@@ -64,7 +62,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12"
+ python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.3" --cuda_ver="12"
- name: Open GitHub issue if weekly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
diff --git a/.github/workflows/nv-torch110-p40.yml b/.github/workflows/nv-torch110-p40.yml
index ed639aeb3b62c..31d7805db7bb2 100644
--- a/.github/workflows/nv-torch110-p40.yml
+++ b/.github/workflows/nv-torch110-p40.yml
@@ -20,7 +20,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- id: setup-venv
uses: ./.github/workflows/setup-venv
diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-torch110-v100.yml
index 4b9f278448ab7..bb1bc987379c3 100644
--- a/.github/workflows/nv-torch110-v100.yml
+++ b/.github/workflows/nv-torch110-v100.yml
@@ -20,7 +20,7 @@ jobs:
env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- id: setup-venv
uses: ./.github/workflows/setup-venv
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 3103e3f36e84f..35f9502ecbc94 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -21,10 +21,10 @@ jobs:
unit-tests:
strategy:
matrix:
- pyVersion: ["3.7", "3.8", "3.9", "3.10"]
+ pyVersion: ["3.8", "3.9", "3.10"]
fail-fast: false
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
container:
image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml
index adeeb0acade2f..d19e73aeef1ce 100644
--- a/.github/workflows/xpu-max1100.yml
+++ b/.github/workflows/xpu-max1100.yml
@@ -36,7 +36,7 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
- image: intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
+ image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
@@ -47,12 +47,11 @@ jobs:
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
- pip install torch==2.1.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
- pip install intel-extension-for-pytorch==2.1.30+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
- pip install intel-extension-for-pytorch-deepspeed==2.1.30 -f https://developer.intel.com/ipex-whl-stable-xpu
- pip install oneccl_bind_pt==2.1.300+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
- pip install torchvision==0.16.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
- pip install py-cpuinfo numpy==1.26
+ pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
+ pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
+ pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
+ pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+ pip install py-cpuinfo numpy
pip install .[dev,autotuning]
- name: Check container state
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b89c872eefe5e..b5d8afa8e0b41 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
- id: trailing-whitespace
- repo: https://github.com/google/yapf
- rev: v0.32.0
+ rev: v0.40.0
hooks:
- id: yapf
@@ -65,7 +65,7 @@ repos:
]
- repo: https://github.com/pycqa/flake8
- rev: 4.0.1
+ rev: 5.0.4
hooks:
- id: flake8
args: ['--config=.flake8']
diff --git a/COMMITTERS.md b/COMMITTERS.md
new file mode 100644
index 0000000000000..bcb8579bf1f79
--- /dev/null
+++ b/COMMITTERS.md
@@ -0,0 +1,9 @@
+# DeepSpeed TSC Committers #
+
+| Name | GitHub ID | Affiliation |
+|--- | ---- | --- |
+| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft |
+| Logan Adams | [loadams](https://github.com/loadams) | Microsoft |
+| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft |
+| Jeff Rasley | [jeffra](https://github.com/jeffra) | Snowflake |
+| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
diff --git a/GOVERNANCE.md b/GOVERNANCE.md
new file mode 100644
index 0000000000000..d488ec55114e7
--- /dev/null
+++ b/GOVERNANCE.md
@@ -0,0 +1,101 @@
+
+# DeepSpeed Project Charter and Governance
+
+This charter sets forth the responsibilities and procedures for technical contribution to, and oversight of, the DeepSpeed open source project. All contributors (including committers, maintainers, and other technical positions) and other participants in the Project (collectively, "Collaborators") must comply with the terms of this Charter.
+
+## Mission and Scope of the Project
+
+The mission of the Project is to develop DeepSpeed, a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
+
+The scope of the Project includes collaborative development under the Project License (as defined herein) supporting the mission, including documentation, testing, integration, and the creation of other artifacts that aid the development, deployment, operation, or adoption of the open source project.
+
+## Technical Steering Committee
+
+1. The Technical Steering Committee (the "TSC") will be responsible for all technical oversight of the open source Project.
+
+2. The TSC voting members are initially the Project's Committers. At the inception of the project, the Committers of the Project will be as set forth within the "CONTRIBUTING" file within the Project's code repository. The TSC may choose an alternative approach for determining the voting members of the TSC, and any such alternative approach will be documented in the CONTRIBUTING file. Any meetings of the Technical Steering Committee are intended to be open to the public, and can be conducted electronically, via teleconference, or in person.
+
+3. TSC projects generally will involve Contributors and Committers. The TSC may adopt or modify roles so long as the roles are documented in the CONTRIBUTING file. Unless otherwise documented:
+
+ - **Contributors** include anyone in the technical community that contributes code, documentation, or other technical artifacts to the Project.
+ - **Committers** are Contributors who have earned the ability to modify ("commit") source code, documentation, or other technical artifacts in a project's repository.
+
+ - A Contributor may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers.
+
+4. Participation in the Project through becoming a Contributor and Committer is open to anyone so long as they abide by the terms of this Charter.
+
+5. The TSC may:
+ - Establish workflow procedures for the submission, approval, and closure/archiving of projects.
+ - Set requirements for the promotion of Contributors to Committer status, as applicable.
+ - Amend, adjust, refine and/or eliminate the roles of Contributors and Committers, and create new roles, and publicly document any TSC roles, as it sees fit.
+
+6. The TSC may elect a TSC Chair, who will preside over meetings of the TSC and will serve until their resignation or replacement by the TSC. The TSC Chair, or any other TSC member so designated by the TSC, will serve as the primary communication contact between the Project and AI & Data, a directed fund of The Linux Foundation.
+
+7. Responsibilities: The TSC will be responsible for all aspects of oversight relating to the Project, which may include:
+
+ - Coordinating the technical direction of the Project.
+ - Approving project or system proposals (including, but not limited to, incubation, deprecation, and changes to a sub-project's scope).
+ - Organizing sub-projects and removing sub-projects.
+ - Creating sub-committees or working groups to focus on cross-project technical issues and requirements.
+ - Appointing representatives to work with other open source or open standards communities.
+ - Establishing community norms, workflows, issuing releases, and security issue reporting policies.
+ - Approving and implementing policies and processes for contributing (to be published in the CONTRIBUTING file) and coordinating with the series manager of the Project (as provided for in the Series Agreement, the "Series Manager") to resolve matters or concerns that may arise as set forth in Section 7 of this Charter.
+ - Discussions, seeking consensus, and where necessary, voting on technical matters relating to the code base that affect multiple projects.
+ - Coordinating any marketing, events, or communications regarding the Project.
+
+## TSC Voting
+
+1. While the Project aims to operate as a consensus-based community, if any TSC decision requires a vote to move the Project forward, the voting members of the TSC will vote on a one vote per voting member basis.
+
+2. Quorum for TSC meetings requires at least fifty percent of all voting members of the TSC to be present. The TSC may continue to meet if quorum is not met but will be prevented from making any decisions at the meeting.
+
+3. Except as provided in Section 7.c. and 8.a, decisions by vote at a meeting require a majority vote of those in attendance, provided quorum is met. Decisions made by electronic vote without a meeting require a majority vote of all voting members of the TSC.
+
+4. In the event a vote cannot be resolved by the TSC, any voting member of the TSC may refer the matter to the Series Manager for assistance in reaching a resolution.
+
+## Compliance with Policies
+
+1. This Charter is subject to the Series Agreement for the Project and the Operating Agreement of LF Projects. Contributors will comply with the policies of LF Projects as may be adopted and amended by LF Projects, including, without limitation, the policies listed at https://lfprojects.org/policies/.
+
+2. The TSC may adopt a code of conduct ("CoC") for the Project, which is subject to approval by the Series Manager. In the event that a Project-specific CoC has not been approved, the LF Projects Code of Conduct listed at https://lfprojects.org/policies will apply for all Collaborators in the Project.
+
+3. When amending or adopting any policy applicable to the Project, LF Projects will publish such policy, as to be amended or adopted, on its website at least 30 days prior to such policy taking effect; provided, however, that in the case of any amendment of the Trademark Policy or Terms of Use of LF Projects, any such amendment is effective upon publication on LF Project's website.
+
+4. All Collaborators must allow open participation from any individual or organization meeting the requirements for contributing under this Charter and any policies adopted for all Collaborators by the TSC, regardless of competitive interests. Put another way, the Project community must not seek to exclude any participant based on any criteria, requirement, or reason other than those that are reasonable and applied on a non-discriminatory basis to all Collaborators in the Project community.
+
+5. The Project will operate in a transparent, open, collaborative, and ethical manner at all times. The output of all Project discussions, proposals, timelines, decisions, and status should be made open and easily visible to all. Any potential violations of this requirement should be reported immediately to the Series Manager.
+
+## Community Assets
+
+1. LF Projects will hold title to all trade or service marks used by the Project ("Project Trademarks"), whether based on common law or registered rights. Project Trademarks will be transferred and assigned to LF Projects to hold on behalf of the Project. Any use of any Project Trademarks by Collaborators in the Project will be in accordance with the license from LF Projects and inure to the benefit of LF Projects.
+
+2. The Project will, as permitted and in accordance with such license from LF Projects, develop and own all Project GitHub and social media accounts, and domain name registrations created by the Project community.
+
+3. Under no circumstances will LF Projects be expected or required to undertake any action on behalf of the Project that is inconsistent with the tax-exempt status or purpose, as applicable, of the Joint Development Foundation or LF Projects, LLC.
+
+## General Rules and Operations
+
+The Project will:
+
+1. Engage in the work of the Project in a professional manner consistent with maintaining a cohesive community, while also maintaining the goodwill and esteem of LF Projects, Joint Development Foundation, and other partner organizations in the open source community.
+2. Respect the rights of all trademark owners, including any branding and trademark usage guidelines.
+
+## Intellectual Property Policy
+
+1. Collaborators acknowledge that the copyright in all new contributions will be retained by the copyright holder as independent works of authorship and that no contributor or copyright holder will be required to assign copyrights to the Project.
+
+2. Except as described in Section 7.c., all contributions to the Project are subject to the following:
+
+ - All new inbound code contributions to the Project must be made using Apache License, Version 2.0 available at http://www.apache.org/licenses/LICENSE-2.0 (the "Project License").
+ - All new inbound code contributions must also be accompanied by a Developer Certificate of Origin (http://developercertificate.org) sign-off in the source code system that is submitted through a TSC-approved contribution process which will bind the authorized contributor and, if not self-employed, their employer to the applicable license.
+ - All outbound code will be made available under the Project License.
+ - Documentation will be received and made available by the Project under the Creative Commons Attribution 4.0 International License (available at http://creativecommons.org/licenses/by/4.0/).
+ - The Project may seek to integrate and contribute back to other open source projects ("Upstream Projects"). In such cases, the Project will conform to all license requirements of the Upstream Projects, including dependencies, leveraged by the Project. Upstream Project code contributions not stored within the Project's main code repository will comply with the contribution process and license terms for the applicable Upstream Project.
+
+3. The TSC may approve the use of an alternative license or licenses for inbound or outbound contributions on an exception basis. To request an exception, please describe the contribution, the alternative open source license(s), and the justification for using an alternative open source license for the Project. License exceptions must be approved by a two-thirds vote of the entire TSC.
+
+4. Contributed files should contain license information, such as SPDX short form identifiers, indicating the open source license or licenses pertaining to the file.
+
+## Amendments
+
+1. This charter may be amended by a two-thirds vote of the entire TSC and is subject to approval by LF Projects.
diff --git a/README.md b/README.md
index b302e32dfd9ca..8bfc344a4edc7 100755
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
## Latest News
DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
-
+* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
* [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)]
* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)]
* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
@@ -121,7 +121,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| | Documentation |
| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
- | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
+ | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/deepspeed) |
| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
| | [Lightning with DeepSpeed](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed) |
| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/projects/composer/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
@@ -142,7 +142,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) |
-| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml/badge.svg?branch=master)](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml) |
+| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml/badge.svg?branch=main)](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml) |
# Installation
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
index 1e4335b192928..0e49bd9f64581 100644
--- a/accelerator/cpu_accelerator.py
+++ b/accelerator/cpu_accelerator.py
@@ -71,6 +71,8 @@ def device_count(self):
# In flat mode, HBM is in separate NUMA node with no cores on this node.
# Ignore these NUMA nodes with no cores.
numa_core_lists = get_numa_cores()
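+ # If NUMA core information cannot be determined, fall back to reporting a single device.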
+ if not numa_core_lists:
+ return 1
numa_count = 0
prev_core_list = []
for core_list in numa_core_lists:
diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py
index 6da48000dafa4..ad8a10710bf2c 100644
--- a/accelerator/xpu_accelerator.py
+++ b/accelerator/xpu_accelerator.py
@@ -26,8 +26,8 @@ def is_synchronized_device(self):
return False
def use_host_timers(self):
- # WA XPU event will be consolidated in 2.5
- if ipex.__version__ < '2.5':
+ # WA XPU event will be consolidated in 2.6
+ if ipex.__version__ < '2.6':
return True
else:
return self.is_synchronized_device()
diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md
index 23ead99f5c8d2..3ec570a9ea407 100644
--- a/blogs/deepspeed-chat/japanese/README.md
+++ b/blogs/deepspeed-chat/japanese/README.md
@@ -332,7 +332,7 @@ DeepSpeedは、以下のような機能を提供します。
DeepSpeedは、Microsoftの[AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/)の一部で、次世代AIの機能の大規模な実現を進めています。詳細は[こちら](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)をご覧ください。DeepSpeedは、[Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), [YaLM (100B)](https://github.com/yandex/YaLM-100B) を含め、様々な大規模モデルを学習するのに使用されてきました。
-またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。
+またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。
DeepSpeedについてのより詳しい情報は、以下をご覧ください。
diff --git a/blogs/deepspeed-domino/README.md b/blogs/deepspeed-domino/README.md
new file mode 100644
index 0000000000000..7dfdc7dac1c09
--- /dev/null
+++ b/blogs/deepspeed-domino/README.md
@@ -0,0 +1,199 @@
+
+
+
+
+
+
+# Domino: Communication-Free LLM Training Engine
+
+
+
+
+
+
+
+
+*Figure 1: Project Domino is Microsoft DeepSpeed's Tensor Parallel (TP) Training Engine, which provides a uniform solution for both single-node and **multi-node** cases. Domino scales the traditional single-node-only TP solution up to multi-node environments via **near-complete communication hiding** behind computation.*
+
+
+
+
+# Table of Contents
+1. [Introduction](#introduction)
+2. [Domino Highlights](#domino-highlights)
+3. [Design Motivation](#design-motivation)
+4. [Domino Design](#domino-design)
+5. [Implementation and Optimization](#implementation-and-optimization)
+6. [Getting Started: Try out DeepSpeed-Domino](#getting-started-try-out-deepspeed-domino)
+7. [Citation](#citation)
+8. [Acknowledgements](#acknowledgements)
+
+
+# Introduction
+
+Generative AI (GenAI) has enabled transformative applications in a wide variety of domains, including chatbots, text summarization, and high-quality image and video generation. These capabilities are built on top of large foundation models, particularly Large Language Models (LLMs). LLMs are typically based on the [Transformer](https://arxiv.org/abs/1706.03762) network architecture and include popular model families such as GPT and Llama. LLMs have grown beyond the memory capacity of a single accelerator (e.g., GPU), so inference and training require distributed processing across multiple GPUs or even multiple nodes.
+
+Tensor parallelism (TP) is a popular distributed technique for training LLMs. TP leverages the aggregate memory of multiple GPUs to fit LLMs by partitioning each model layer across the GPUs. However, TP incurs two communication collectives per partitioned layer: one in the forward pass and one in the backward pass. TP is appealing due to its excellent system efficiency in single-node cases, where GPUs are directly connected via high-bandwidth links like NVLink and NVSwitch. However, TP falls short in multi-node cases due to the lower bandwidth of cross-node interconnects. [Prior work](https://arxiv.org/abs/2406.06858) reports that communication can take up to 75\% of end-to-end training time. Figure 2 shows that even on the latest DGX-H100 nodes interconnected with high-end InfiniBand at 400GB/s bandwidth, communication overhead remains as high as 43\% of end-to-end training iteration time. Recent advances in GeMM+NCCL kernel fusion are unable to fully hide communication overheads due to their limited scope of computation-communication overlapping. The trend of faster compute in newer GPUs (e.g., DGX-B200) indicates that the communication overhead of TP will become even more pronounced in both single-node and multi-node scenarios.
+
+
+
+
+ *Figure 2: TP communication overhead in GPT-3-13B training using 1, 2, and 4 DGX-H100 nodes (i.e., 8, 16, and 32 H100 GPUs).*
+
+
+
+# Domino Highlights
+
+
+* Domino is a TP optimization technique that achieves **near-complete** communication hiding behind computation by decomposing a single batch training iteration into smaller, independent pieces that can be efficiently pipelined.
+
+* Domino is the first work to provide a **uniform** Tensor Parallelism (TP) solution for both single-node and **multi-node** cases. Traditional TP solutions (e.g., Megatron-LM) fall short in multi-node cases due to limited cross-node communication bandwidth.
+
+### Performance
+
+We tested Domino on 1 to 4 DGX-H100 boxes (8x H100 per box). Each node has an intra-node NVLink bandwidth of 900GB/s and an inter-node IB bandwidth of 400GB/s. We observed the following performance results:
+1. For both the GPT and Llama model series, Domino outperforms Megatron-LM by up to **1.3x** and **1.2x**, respectively, in end-to-end training iteration throughput across different model sizes, sequence lengths, and batch sizes. These results are summarized in Figure 1.
+2. For several cases, Domino achieves **near-optimal** training throughput, where optimal throughput refers to the throughput achieved assuming the communication collectives of TP are disabled.
+
+For more detailed performance results, please refer to our [arxiv paper](https://arxiv.org/abs/2409.15241).
+
+# Design Motivation
+
+In this section, we briefly discuss three topics. First, we motivate why the time is right for a uniform TP solution covering both single-node and multi-node cases. Next, we analyze the communication overhead on the latest Nvidia DGX-H100 boxes, which have high cross-node communication bandwidth. Finally, we describe TP's sequential data dependency, which leaves communication exposed on the critical path.
+
+### It is time for a uniform TP for single and multi-node scenarios
+
+Nvidia is pushing hard to close the communication bandwidth gap between intra-node links (i.e., GPUs within a node connected via NVLink) and inter-node links (i.e., nodes connected via InfiniBand (IB)). For example, each DGX-H100 is equipped with eight ConnectX-7 network cards, providing an aggregate cross-node bandwidth of 400GB/s, which is at the same level as intra-node NVLink (900GB/s). Therefore, it is time to propose a uniform solution for both single-node and multi-node TP training.
+
+### Communication Overhead in TP
+
+As described in the [Megatron-LM paper](https://arxiv.org/pdf/1909.08053), with TP every transformer block (i.e., 1 Self-Attention layer + 1 MLP layer) incurs 4 AllReduce calls: two in the forward pass and two in the backward pass (shown in Figure 3). Given an LLM consisting of $N$ stacked transformer blocks, the number of AllReduce calls required for TP training is $4 * N$. Even for small models like GPT-3 2.7B or 6.7B, which consist of 32 layers, the total number of AllReduce calls is 128 for every training iteration. For larger models, the number of AllReduce calls grows linearly with the number of layers.
+
+
+
+
+ *Figure 3: TP communication = 4 x AllReduce x num\_transformer\_block*
+
+
+
+One big issue for TP is that the *communication resides on the critical path of every training batch* due to the sequential data dependency described in the [TP data dependency analysis](#tp-data-dependency-analysis) section below. Therefore, the communication overhead is exposed and difficult to hide behind computation. In Figure 4, we report communication overhead measurements for Megatron-LM training of the GPT-3 and Llama-2 model series with different model sizes and batch sizes across 1 to 4 DGX-H100 nodes (i.e., 8 to 32 H100 GPUs). The communication overhead is up to **47\%**, despite using the latest Nvidia DGX-H100 hardware with 400GB/s cross-node bandwidth.
+
+
+
+
+ *Figure 4: TP communication and computation ratio per training iteration time over different models and batch sizes using 1 to 4 DGX-H100 nodes.*
+
+
+
+As Llama-3 405B training takes 54 days on 16,000 H100 GPUs, the projected communication time can be up to around **25 days on 16,000 H100s**. This finding shows that, despite the latest high-bandwidth interconnects like NVLink and InfiniBand (IB), the communication overhead of TP remains a huge portion of end-to-end training time.
+
+### TP data dependency analysis
+
+In traditional TP, shown in Figure 5, the computation of a transformer layer (either an Attention or an MLP layer) can be abstracted as $X\*A\*B=Y$, where $X$ is the input. For the attention layer, $A$ is the attention computation (e.g., multihead-attention) and $B$ is a linear layer. For the MLP layer, both $A$ and $B$ are linear layers. An AllReduce is conducted on $Y$ after the computation. Due to the **sequential data dependency on $Y$ between computation (i.e., $X\*A\*B=Y$) and communication (i.e., AllReduce($Y$)), AllReduce($Y$) is fully exposed**, making TP inefficient in scenarios with limited communication bandwidth.
+
+
+
+
+
+ *Figure 5: TP forward pass of a single Self-Attention/MLP layer. (X is the input; A is the attention computation for the Self-Attention layer and a linear layer for the MLP layer; B is a linear layer for both; Y is the output of X\*A\*B.)*
+
+
+
+
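+To make this dependency concrete, the following is a minimal PyTorch-style sketch of a vanilla TP layer forward pass (illustrative only; the names `X`, `A`, `B` follow Figure 5, and `dist.all_reduce` stands in for the TP collective):
+
+```python
+import torch
+import torch.distributed as dist
+
+def tp_layer_forward(X, A, B):
+    # Each rank holds its shard of A and B and computes a partial result of Y.
+    Y = (X @ A) @ B
+    # AllReduce(Y) cannot start until both matmuls finish, and the next layer
+    # cannot start until AllReduce(Y) finishes: the communication is fully exposed.
+    dist.all_reduce(Y)
+    return Y
+```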
+
+# Domino Design
+
+Compared to Figure 5, Domino breaks the data dependency of $X\*A\*B$ via [*Row-wise Split on Inputs X*](#row-wise-split-on-inputs-x), [*Column-wise Split on Weights B*](#column-wise-split-on-weights-b), and a [hybrid solution combining the two](#2d-split-on-both-x-and-b). After breaking the computation into pieces, Domino pipelines computation and communication over different independent pieces, thus achieving near-complete hiding of communication behind computation. Domino's unique benefits are as follows:
+
+1. Compared with GeMM+NCCL kernel fusion techniques, Domino breaks the data dependency and thus has a much wider range of computation kernel sequences to overlap with NCCL calls. For example, Domino can overlap an AllReduce not only with a single GeMM, but also with multiple GeMMs, LayerNorm, Dropout, and more.
+2. Domino achieves near-complete hiding of communication behind computation, and thus near-optimal system throughput in certain cases. (Optimal throughput refers to the end-to-end throughput achieved when all TP communication is disabled.)
+3. Domino works at the kernel scheduler level, so any kernel optimization or new kernel can be seamlessly integrated into the Domino framework.
+4. Domino's tensor partition scheme is simple and generic, which makes user-side end-to-end correctness debugging easy when facing issues such as overflow or weight/gradient errors.
+
+For ease of illustration, we describe forward propagation only (the backward pass simply follows the reverse order) and only the case of splitting a tensor into two chunks.
+
+## Row-wise split on Inputs X:
+
+Domino splits the input X along the row dimension (i.e., the batch dimension).
+
+
+
+
+ *Figure 6: Domino row-wise (batch-dim) split on inputs X.*
+
+
+
+**Data Dependency**: Splitting the inputs along the batch dimension introduces no data dependency in either the intra-layer or the inter-layer case. Therefore, we achieve both *intra-layer* (AllReduce($Y1$) and $X2\*A\*B$) and *inter-layer* (AllReduce($Y2$) and the next layer's $X1\*A\*B$) computation-communication overlapping. With this batch split on inputs, Domino can hide up to **100\%** of communication behind computation.
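+
+The following is a minimal sketch of this overlap, assuming the input is split into two chunks and using PyTorch's asynchronous collectives (`async_op=True`); it illustrates the idea rather than Domino's actual scheduler:
+
+```python
+import torch.distributed as dist
+
+def domino_row_split_forward(X, A, B):
+    X1, X2 = X.chunk(2, dim=0)                 # split inputs along the batch (row) dimension
+
+    Y1 = (X1 @ A) @ B                          # compute chunk 0
+    h1 = dist.all_reduce(Y1, async_op=True)    # launch AllReduce(Y1) asynchronously
+
+    Y2 = (X2 @ A) @ B                          # overlap: compute chunk 1 while Y1 reduces
+    h2 = dist.all_reduce(Y2, async_op=True)    # AllReduce(Y2) overlaps the next layer's X1*A*B
+
+    h1.wait()                                  # Y1 is ready for the next layer's chunk 0
+    return Y1, Y2, h2                          # h2 is waited on just before Y2 is consumed
+```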
+
+## Column-wise split on Weights B:
+
+Domino splits the weight matrix B along the column dimension.
+
+
+
+
+
+ *Figure 7: Domino column-wise (last-dim) split on weights B.*
+
+
+
+**Data Dependency**: Splitting the weights B column-wise introduces no data dependency in the intra-layer case, but it does introduce one in the inter-layer case. Therefore, we only achieve *intra-layer*
+ (AllReduce($Y1$) and $X2\*A\*B$) computation-communication overlapping. This column-wise split on weights remains essential, since a row-wise input split alone would produce narrow tensor shapes that hinder kernel computational efficiency. In practice, Domino hides 50\% to 70\% of communication behind computation with the column-wise split of weights B.
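+
+A corresponding sketch for the column-wise split, again with two chunks and asynchronous collectives (illustrative only, not Domino's implementation):
+
+```python
+import torch
+import torch.distributed as dist
+
+def domino_column_split_forward(X, A, B):
+    B1, B2 = B.chunk(2, dim=-1)                # split weights B along the column (last) dimension
+
+    XA = X @ A
+    Y1 = XA @ B1
+    h1 = dist.all_reduce(Y1, async_op=True)    # hidden behind the computation of Y2
+
+    Y2 = XA @ B2
+    dist.all_reduce(Y2)                        # inter-layer dependency: must complete here
+
+    h1.wait()
+    return torch.cat([Y1, Y2], dim=-1)         # reassemble the full layer output
+```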
+
+## 2D Split on both X and B:
+
+For extremely large LLMs, Domino splits the inputs X row-wise and the weights B column-wise. This method benefits model training that requires both a low memory footprint and minimal communication overhead.
+
+
+
+
+ *Figure 8: Domino 2D split on both inputs X and weights B.*
+
+
+
+**Data Dependency**: This 2D split policy inherits the synchronization at the end of each transformer layer that is caused by the column-wise split on weights B. Therefore, the 2D approach only achieves *intra-layer* computation-communication overlapping.
+
+# Implementation and Optimization
+
+For brevity, we summarize the key implementation details of the row-wise input split. For more implementation details, please refer to our [arxiv paper](https://arxiv.org/abs/2409.15241).
+
+**Forward:** Figure 9 shows how we position and trigger NCCL calls so that they overlap with computation kernel sequences during forward propagation. We split the batch into two chunks, $\mu$-batch0 and $\mu$-batch1, and denote $\mu$-batch0's attention output as attn0 and its MLP output as MLP0, and $\mu$-batch1's attention output as attn1 and its MLP output as MLP1. AllReduce(attn0) is overlapped with the self-attention computation of $\mu$-batch1. For AllReduce(attn1), we group multiple $\mu$-batches' Dropout, Residual, and LayerNorm computations for computation-communication overlapping. This small-kernel grouping not only enables complete hiding of AllReduce(attn1), but also provides adequate overlapping room for AllReduce(MLP0) in the backward pass shown in Figure 10. We hide AllReduce(MLP0) behind $\mu$-batch1's MLP computation kernel sequence of GeMM + GeLU + GeMM, and AllReduce(MLP1) behind the next layer's attention computation.
+
+
+
+
+ *Figure 9: Transformer block (i.e., 1 self-attn + 1 MLP) forward pass. The upper figure is the vanilla TP implementation; the bottom is the Domino implementation.*
+
+
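+The schedule above can be summarized in pseudocode as follows (helper names such as `all_reduce_async` and `wait_get` are illustrative, not Domino's actual API):
+
+```python
+# Forward schedule for one transformer block with two micro-batches (cf. Figure 9).
+attn0 = self_attention(x0)
+h_attn0 = all_reduce_async(attn0)      # AllReduce(attn0): hidden behind attention on mu-batch 1
+attn1 = self_attention(x1)
+h_attn1 = all_reduce_async(attn1)      # AllReduce(attn1): hidden behind the grouped small kernels
+
+y0 = layer_norm(x0 + dropout(wait_get(h_attn0)))   # grouped Dropout/Residual/LayerNorm
+y1 = layer_norm(x1 + dropout(wait_get(h_attn1)))
+
+mlp0 = mlp(y0)                         # GeMM + GeLU + GeMM
+h_mlp0 = all_reduce_async(mlp0)        # AllReduce(MLP0): hidden behind mu-batch 1's MLP
+mlp1 = mlp(y1)
+h_mlp1 = all_reduce_async(mlp1)        # AllReduce(MLP1): hidden behind the next layer's attention
+```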
+
+**Backward:** Figure 10 shows a simple example where the batch is split into two $\mu$-batches, $\mu$-batch0 and $\mu$-batch1. Besides an overlapping strategy similar to the forward pass, we extend the scope of overlap to communication with the weight-gradient computation inside the same $\mu$-batch (e.g., AllReduce(MLP1) partially overlaps with its own $\mu$-batch1 computation, shown as the 3rd orange block from the left). Each *grad matmul* includes two separate GeMM computations: one for the input gradients and one for the weight gradients. Therefore, we can extend the overlapping scope by overlapping AllReduce(MLP1) with $\mu$-batch1's weight-gradient computation.
+
+The backward pass is a bit more challenging because its computation graph is automatically generated by torch.autograd(). To precisely control when NCCL calls are triggered, we implement a *no\_operation* module, which obtains the communication handle during the forward pass and retains it for use during the backward pass. Our *no\_operation* module works seamlessly with torch.autograd() and enables us to precisely control NCCL start/end times without rewriting a customized backward computation graph.
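+
+A minimal sketch of this idea using `torch.autograd.Function` is shown below; the class name and handle-passing scheme are illustrative assumptions rather than Domino's actual implementation:
+
+```python
+import torch
+
+class NoOperation(torch.autograd.Function):
+    """Identity op that defers waiting on an async communication handle to backward."""
+
+    @staticmethod
+    def forward(ctx, tensor, comm_handle):
+        # Keep the handle of an asynchronously launched collective for the backward pass.
+        ctx.comm_handle = comm_handle
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Waiting here pins the NCCL completion point to an exact spot in the
+        # autograd-generated backward graph, without a hand-written backward graph.
+        if ctx.comm_handle is not None:
+            ctx.comm_handle.wait()
+        return grad_output, None
+```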
+
+
+
+
+ *Figure 10: Transformer block (i.e., 1 self-attn + 1 MLP) backward pass. The upper figure is the vanilla TP implementation; the bottom is the Domino implementation.*
+
+
+
+**General kernel optimizations:** We also adopt general kernel-level optimization techniques. For example, we use CUDA Graphs to squeeze out the idle/bubble time between adjacent compute kernels and reduce end-to-end latency, we use multiple CUDA streams to increase parallel execution, and we leverage torch.compile() to further improve system efficiency.
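+
+As a generic example of one of these techniques, the sketch below captures and replays a fixed kernel sequence with `torch.cuda.CUDAGraph` to remove launch gaps between adjacent kernels (not Domino-specific code):
+
+```python
+import torch
+
+x = torch.randn(1024, 1024, device="cuda")
+w = torch.randn(1024, 1024, device="cuda")
+
+# Warm up on a side stream so capture sees initialized kernels and allocations.
+s = torch.cuda.Stream()
+s.wait_stream(torch.cuda.current_stream())
+with torch.cuda.stream(s):
+    y = torch.relu(x @ w)
+torch.cuda.current_stream().wait_stream(s)
+
+g = torch.cuda.CUDAGraph()
+with torch.cuda.graph(g):
+    y = torch.relu(x @ w)       # the captured kernel sequence
+
+x.copy_(torch.randn(1024, 1024, device="cuda"))
+g.replay()                      # re-launches the whole sequence with minimal gaps
+```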
+
+# Getting Started: Try out DeepSpeed-Domino
+
+To try out DeepSpeed-Domino, please refer to [Domino tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in our DeepSpeedExample repo.
+
+## Citation
+
+```
+@article{wang2024-deepspeed-domino,
+ title={{Domino: Eliminating Communication in LLM Training via Generic Tensor Slicing and Overlapping}},
+ author={Guanhua Wang and Chengming Zhang and Zheyu Shen and Ang Li and Olatunji Ruwase},
+ journal={arXiv preprint arXiv:2409.15241},
+ year={2024}
+}
+```
+
+## Acknowledgements
+
+This work is the result of a deep collaboration between Microsoft DeepSpeed and our academic partners from the University of Maryland and the University of Houston. The contributors include [Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), [Hongwei Chen](https://github.com/hwchen2017) and [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/) from the Microsoft DeepSpeed team, [Chengming Zhang](https://chengmingzh8.github.io/) from the University of Houston, and [Zheyu Shen](https://www.linkedin.com/in/zheyushen/) and [Ang Li](https://www.ang-li.com/) from the University of Maryland.
diff --git a/blogs/deepspeed-domino/images/design-base.png b/blogs/deepspeed-domino/images/design-base.png
new file mode 100644
index 0000000000000..d347e9c2ba8b1
Binary files /dev/null and b/blogs/deepspeed-domino/images/design-base.png differ
diff --git a/blogs/deepspeed-domino/images/design-column.png b/blogs/deepspeed-domino/images/design-column.png
new file mode 100644
index 0000000000000..a99ad3c6b461e
Binary files /dev/null and b/blogs/deepspeed-domino/images/design-column.png differ
diff --git a/blogs/deepspeed-domino/images/design-hybrid.png b/blogs/deepspeed-domino/images/design-hybrid.png
new file mode 100644
index 0000000000000..302e3f95e8fc0
Binary files /dev/null and b/blogs/deepspeed-domino/images/design-hybrid.png differ
diff --git a/blogs/deepspeed-domino/images/design-row.png b/blogs/deepspeed-domino/images/design-row.png
new file mode 100644
index 0000000000000..551a54f4e651d
Binary files /dev/null and b/blogs/deepspeed-domino/images/design-row.png differ
diff --git a/blogs/deepspeed-domino/images/domino-hero.png b/blogs/deepspeed-domino/images/domino-hero.png
new file mode 100644
index 0000000000000..078b6472b42aa
Binary files /dev/null and b/blogs/deepspeed-domino/images/domino-hero.png differ
diff --git a/blogs/deepspeed-domino/images/domino-logo.png b/blogs/deepspeed-domino/images/domino-logo.png
new file mode 100644
index 0000000000000..58be0990b9448
Binary files /dev/null and b/blogs/deepspeed-domino/images/domino-logo.png differ
diff --git a/blogs/deepspeed-domino/images/gpt3-scale.png b/blogs/deepspeed-domino/images/gpt3-scale.png
new file mode 100644
index 0000000000000..611b2221a73c8
Binary files /dev/null and b/blogs/deepspeed-domino/images/gpt3-scale.png differ
diff --git a/blogs/deepspeed-domino/images/implement-bwd.png b/blogs/deepspeed-domino/images/implement-bwd.png
new file mode 100644
index 0000000000000..4b115222f3871
Binary files /dev/null and b/blogs/deepspeed-domino/images/implement-bwd.png differ
diff --git a/blogs/deepspeed-domino/images/implement-fwd.png b/blogs/deepspeed-domino/images/implement-fwd.png
new file mode 100644
index 0000000000000..51d3a73bae58c
Binary files /dev/null and b/blogs/deepspeed-domino/images/implement-fwd.png differ
diff --git a/blogs/deepspeed-domino/images/tp-ar.png b/blogs/deepspeed-domino/images/tp-ar.png
new file mode 100644
index 0000000000000..6dd01ccceed8c
Binary files /dev/null and b/blogs/deepspeed-domino/images/tp-ar.png differ
diff --git a/blogs/deepspeed-domino/images/tp-comm-overhead.png b/blogs/deepspeed-domino/images/tp-comm-overhead.png
new file mode 100644
index 0000000000000..947473ff5261c
Binary files /dev/null and b/blogs/deepspeed-domino/images/tp-comm-overhead.png differ
diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp
index 5790e79e2bc2a..e276ad0856dd6 100644
--- a/csrc/adagrad/cpu_adagrad.cpp
+++ b/csrc/adagrad/cpu_adagrad.cpp
@@ -17,9 +17,9 @@ static std::unordered_map> s_optimizers;
// C++ interface
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Adagrad_Optimizer::Step_1(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Adagrad_Optimizer::Step_1(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
{
@@ -56,9 +56,9 @@ void Adagrad_Optimizer::Step_1(ds_params_percision_t* _params,
}
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Adagrad_Optimizer::Step_4(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Adagrad_Optimizer::Step_4(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
{
@@ -104,9 +104,9 @@ int create_adagrad_optimizer(int optimizer_id,
return 0;
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Adagrad_Optimizer::Step_8(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Adagrad_Optimizer::Step_8(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
{
@@ -121,15 +121,15 @@ void Adagrad_Optimizer::Step_8(ds_params_percision_t* _params,
(_param_size - rounded_size));
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void step_invoker(std::shared_ptr opt,
void* _params,
void* grads,
void* _exp_avg_sq,
size_t _param_size)
{
- opt->Step_8((ds_params_percision_t*)(_params),
- (ds_params_percision_t*)(grads),
+ opt->Step_8((ds_params_precision_t*)(_params),
+ (ds_params_precision_t*)(grads),
(ds_state_precision_t*)(_exp_avg_sq),
_param_size);
}
@@ -139,12 +139,12 @@ std::map,
invokers;
// Fill map with template functions for each type
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void create_invoker()
{
- invokers[std::tuple(c10::CppTypeToScalarType<ds_params_percision_t>(),
+ invokers[std::tuple(c10::CppTypeToScalarType<ds_params_precision_t>(),
c10::CppTypeToScalarType<ds_state_precision_t>())] =
- step_invoker<ds_params_percision_t, ds_state_precision_t>;
+ step_invoker<ds_params_precision_t, ds_state_precision_t>;
}
struct InvokerInitializer {
InvokerInitializer()
diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp
index 15d4e74d69d5f..465aae7b9a343 100644
--- a/csrc/adam/cpu_adam_impl.cpp
+++ b/csrc/adam/cpu_adam_impl.cpp
@@ -18,9 +18,9 @@ static std::unordered_map> s_optimizers;
// C++ interface
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Adam_Optimizer::Step_1(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Adam_Optimizer::Step_1(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
@@ -67,9 +67,9 @@ void Adam_Optimizer::Step_1(ds_params_percision_t* _params,
}
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Adam_Optimizer::Step_4(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Adam_Optimizer::Step_4(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
@@ -126,9 +126,9 @@ int create_adam_optimizer(int optimizer_id,
return 0;
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Adam_Optimizer::Step_8(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Adam_Optimizer::Step_8(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
@@ -145,7 +145,7 @@ void Adam_Optimizer::Step_8(ds_params_percision_t* _params,
(_param_size - rounded_size));
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void step_invoker(std::shared_ptr opt,
void* _params,
void* grads,
@@ -153,8 +153,8 @@ void step_invoker(std::shared_ptr opt,
void* _exp_avg_sq,
size_t _param_size)
{
- opt->Step_8((ds_params_percision_t*)(_params),
- (ds_params_percision_t*)(grads),
+ opt->Step_8((ds_params_precision_t*)(_params),
+ (ds_params_precision_t*)(grads),
(ds_state_precision_t*)(_exp_avg),
(ds_state_precision_t*)(_exp_avg_sq),
_param_size);
@@ -165,12 +165,12 @@ std::map,
invokers;
// Fill map with template functions for each type
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void create_invoker()
{
- invokers[std::tuple(c10::CppTypeToScalarType<ds_params_percision_t>(),
+ invokers[std::tuple(c10::CppTypeToScalarType<ds_params_precision_t>(),
c10::CppTypeToScalarType<ds_state_precision_t>())] =
- step_invoker<ds_params_percision_t, ds_state_precision_t>;
+ step_invoker<ds_params_precision_t, ds_state_precision_t>;
}
struct InvokerInitializer {
InvokerInitializer()
diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp
index 0536ff6a362e7..fb269b58315fb 100644
--- a/csrc/aio/common/deepspeed_aio_utils.cpp
+++ b/csrc/aio/common/deepspeed_aio_utils.cpp
@@ -19,9 +19,14 @@ const int c_io_queue_depth = 8;
io_xfer_ctxt::io_xfer_ctxt(const int fd,
const int64_t file_offset,
+ const int64_t buffer_offset,
const int64_t num_bytes,
const void* buffer)
- : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes)
+ : _fd(fd),
+ _file_base_offset(file_offset),
+ _buffer_base_offset(buffer_offset),
+ _mem_buffer(buffer),
+ _num_bytes(num_bytes)
{
}
@@ -41,9 +46,10 @@ void io_prep_context::prep_iocbs(const int n_iocbs,
assert(static_cast(n_iocbs) <= _iocbs->size());
for (auto i = 0; i < n_iocbs; ++i) {
const auto shift = i * _block_size;
- const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift;
- const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift;
+ const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_buffer_base_offset + shift;
+ const auto xfer_offset = _xfer_ctxt->_file_base_offset + start_offset + shift;
auto byte_count = _block_size;
+
if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; }
if (_read_op) {
@@ -79,10 +85,10 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector*
auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks);
for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) {
- const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size);
- const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset;
+ const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + _xfer_ctxt->_buffer_base_offset +
+ (_next_iocb_index * _block_size);
+ const auto xfer_offset = _xfer_ctxt->_file_base_offset + (_next_iocb_index * _block_size);
const auto num_bytes = min(static_cast(_block_size), _remaining_bytes);
-
if (_read_op) {
io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
} else {
diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h
index 20e81fe8eebd7..6b7599acecb44 100644
--- a/csrc/aio/common/deepspeed_aio_utils.h
+++ b/csrc/aio/common/deepspeed_aio_utils.h
@@ -30,12 +30,14 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
struct io_xfer_ctxt {
const int _fd;
- const int64_t _base_offset;
+ const int64_t _file_base_offset;
+ const int64_t _buffer_base_offset;
const void* _mem_buffer;
const int64_t _num_bytes;
io_xfer_ctxt(const int fd,
const int64_t file_offset,
+ const int64_t buffer_offset,
const int64_t num_bytes,
const void* buffer);
};
diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
index 6f311c5400c72..9452513972257 100644
--- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
@@ -13,14 +13,16 @@ io_op_desc_t::io_op_desc_t(const bool read_op,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
- const bool validate)
+ const bool validate,
+ const int64_t file_offset)
: _read_op(read_op),
_buffer(buffer),
_fd(fd),
_filename(filename),
_file_num_bytes(file_num_bytes),
+ _file_offset(file_offset),
_intra_op_parallelism(intra_op_parallelism),
- _num_bytes_per_thread(file_num_bytes / intra_op_parallelism),
+ _num_bytes_per_thread(static_cast(buffer.nbytes()) / intra_op_parallelism),
_validate(validate)
{
}
diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
index f841b8ce520a5..ac1cdf90f78be 100644
--- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
@@ -19,6 +19,7 @@ struct io_op_desc_t {
const int64_t _num_bytes_per_thread;
torch::Tensor _contiguous_buffer;
const bool _validate;
+ const int64_t _file_offset;
io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
@@ -26,7 +27,8 @@ struct io_op_desc_t {
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
- const bool validate);
+ const bool validate,
+ const int64_t file_offset);
virtual void run(const int tid,
std::unique_ptr<aio_context>& aio_ctxt,
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
index da2ff568d74bf..56fb33fb18869 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
@@ -16,8 +16,16 @@ cpu_op_desc_t::cpu_op_desc_t(
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
- const bool validate)
- : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate),
+ const bool validate,
+ const int64_t file_offset)
+ : io_op_desc_t(read_op,
+ buffer,
+ fd,
+ filename,
+ file_num_bytes,
+ intra_op_parallelism,
+ validate,
+ file_offset),
_cpu_buffer(buffer),
_pinned_tensor_mgr(pinned_tensor_mgr),
_is_managed_bounce_buffer(false)
@@ -66,10 +74,11 @@ void cpu_op_desc_t::run(const int tid,
deepspeed_aio_config_t* aio_config)
{
assert(tid < _intra_op_parallelism);
- const auto base_offset = _num_bytes_per_thread * tid;
+ const auto buffer_base_offset = _num_bytes_per_thread * tid;
+ const auto file_base_offset = _file_offset + (_num_bytes_per_thread * tid);
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
-        new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr()));
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
+        _fd, file_base_offset, buffer_base_offset, _num_bytes_per_thread, data_ptr()));
if (aio_config->_overlap_events) {
do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h
index 9de2fa2540486..debaf4a90731f 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.h
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.h
@@ -20,7 +20,8 @@ struct cpu_op_desc_t : io_op_desc_t {
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
- const bool validate);
+ const bool validate,
+ const int64_t file_offset);
void run(const int tid,
std::unique_ptr<aio_context>& aio_ctxt,
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index 02b04057d1ac3..1ff0397043fae 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -52,7 +52,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer,
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast(buffer.nbytes());
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
+
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
+        new io_xfer_ctxt(fd, 0, 0, num_write_bytes, write_buffer));
std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));
if (config._overlap_events) {
@@ -97,7 +99,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer,
auto read_buffer = (char*)buffer.data_ptr();
assert(static_cast(buffer.nbytes()) == num_file_bytes);
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
+        new io_xfer_ctxt(fd, 0, 0, num_file_bytes, read_buffer));
std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));
if (config._overlap_events) {
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
index 48ea8a1339d45..64d7c2e0541e1 100644
--- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
@@ -58,7 +58,10 @@ const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_e
const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; }
-int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
+int deepspeed_io_handle_t::read(torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const int64_t file_offset)
{
const auto start_time = std::chrono::high_resolution_clock::now();
@@ -76,7 +79,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con
if (fd == -1) { return -1; }
auto read_buffer = (char*)buffer.data_ptr();
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
+        new io_xfer_ctxt(fd, file_offset, 0, num_file_bytes, read_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
@@ -98,7 +102,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con
int deepspeed_io_handle_t::write(const torch::Tensor& buffer,
const char* filename,
- const bool validate)
+ const bool validate,
+ const int64_t file_offset)
{
assert(_aio_ctxt);
@@ -109,7 +114,8 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer,
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast(buffer.nbytes());
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
+        new io_xfer_ctxt(fd, file_offset, 0, num_write_bytes, write_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
@@ -206,7 +212,8 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc(
const int fd,
const char* filename,
const int64_t file_num_bytes,
- const bool validate)
+ const bool validate,
+ const int64_t file_offset)
{
return std::make_shared<cpu_op_desc_t>(read_op,
buffer,
@@ -215,13 +222,15 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc(
filename,
file_num_bytes,
_intra_op_parallelism,
- validate);
+ validate,
+ file_offset);
}
int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
- const bool async)
+ const bool async,
+ const int64_t file_offset)
{
int64_t num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
@@ -229,20 +238,18 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
report_file_error(filename, " fstat for read", error_code);
return -1;
}
+
+ // buffer can exceed file size to enable 4k alignment
const auto buffer_bytes = static_cast(buffer.nbytes());
- if (buffer_bytes != num_file_bytes) {
- std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
- << " != " << num_file_bytes << std::endl;
- }
- assert(buffer_bytes == num_file_bytes);
assert((num_file_bytes % _intra_op_parallelism) == 0);
- if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
+ if (!_is_valid_parallel_aio_op(true, buffer_bytes)) { return -1; }
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
- auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate);
+ auto scheduled_op =
+ _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate, file_offset);
_schedule_aio_work(scheduled_op);
@@ -254,7 +261,8 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
const char* filename,
const bool validate,
- const bool async)
+ const bool async,
+ const int64_t file_offset)
{
const auto num_write_bytes = static_cast(buffer.nbytes());
assert((num_write_bytes % _intra_op_parallelism) == 0);
@@ -264,7 +272,8 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
- auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate);
+ auto scheduled_op =
+ _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate, file_offset);
_schedule_aio_work(scheduled_op);
@@ -273,24 +282,32 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
return wait();
}
-int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
+int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer,
+ const char* filename,
+ const int64_t file_offset)
{
- return pread(buffer, filename, false, false);
+ return pread(buffer, filename, false, false, file_offset);
}
-int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
+int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer,
+ const char* filename,
+ const int64_t file_offset)
{
- return pwrite(buffer, filename, false, false);
+ return pwrite(buffer, filename, false, false, file_offset);
}
-int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
+int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer,
+ const char* filename,
+ const int64_t file_offset)
{
- return pread(buffer, filename, false, true);
+ return pread(buffer, filename, false, true, file_offset);
}
-int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
+int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer,
+ const char* filename,
+ const int64_t file_offset)
{
- return pwrite(buffer, filename, false, true);
+ return pwrite(buffer, filename, false, true, file_offset);
}
at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem,
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h
index 4fedf80808189..dfcb4125ab9ad 100644
--- a/csrc/aio/py_lib/deepspeed_py_io_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h
@@ -38,27 +38,35 @@ struct deepspeed_io_handle_t {
const bool get_overlap_events() const;
const int get_intra_op_parallelism() const;
- int read(torch::Tensor& buffer, const char* filename, const bool validate);
+ int read(torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const int64_t file_offset);
- int write(const torch::Tensor& buffer, const char* filename, const bool validate);
+ int write(const torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const int64_t file_offset);
int pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
- const bool async);
+ const bool async,
+ const int64_t file_offset);
int pwrite(const torch::Tensor& buffer,
const char* filename,
const bool validate,
- const bool async);
+ const bool async,
+ const int64_t file_offset);
- int sync_pread(torch::Tensor& buffer, const char* filename);
+ int sync_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset);
- int sync_pwrite(const torch::Tensor& buffer, const char* filename);
+ int sync_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset);
- int async_pread(torch::Tensor& buffer, const char* filename);
+ int async_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset);
- int async_pwrite(const torch::Tensor& buffer, const char* filename);
+ int async_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset);
// TODO: Make API's args to be shape and dtype.
torch::Tensor new_cpu_locked_tensor(const int64_t num_elem,
@@ -81,5 +89,6 @@ struct deepspeed_io_handle_t {
const int fd,
const char* filename,
const int64_t file_num_bytes,
- const bool validate);
+ const bool validate,
+ const int64_t file_offset);
};
diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp
index b80fa2d6c8e6d..bf298b691b814 100644
--- a/csrc/aio/py_lib/py_ds_aio.cpp
+++ b/csrc/aio/py_lib/py_ds_aio.cpp
@@ -40,14 +40,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
"Synchronous and non-parallel file read. Returns count of completed read ops",
"buffer"_a,
"filename"_a,
- "validate"_a)
+ "validate"_a,
+ "file_offset"_a = 0)
.def("write",
&deepspeed_aio_handle_t::write,
"Synchronous and non-parallel file write. Returns count of completed write ops",
"buffer"_a,
"filename"_a,
- "validate"_a)
+ "validate"_a,
+ "file_offset"_a = 0)
.def("pread",
&deepspeed_aio_handle_t::pread,
@@ -55,7 +57,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
"buffer"_a,
"filename"_a,
"validate"_a,
- "async"_a)
+ "async"_a,
+ "file_offset"_a = 0)
.def("pwrite",
&deepspeed_aio_handle_t::pwrite,
@@ -63,33 +66,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
"buffer"_a,
"filename"_a,
"validate"_a,
- "async"_a)
+ "async"_a,
+ "file_offset"_a = 0)
.def("sync_pread",
&deepspeed_aio_handle_t::sync_pread,
"Synchrononous parallel file read. Returns count of completed read ops",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("sync_pwrite",
&deepspeed_aio_handle_t::sync_pwrite,
"Synchronous parallel file write. Returns count of completed write ops",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("async_pread",
&deepspeed_aio_handle_t::async_pread,
"Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
"following wait() returns count of completed ops.",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("async_pwrite",
&deepspeed_aio_handle_t::async_pwrite,
"Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
"count of completed ops.",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("new_cpu_locked_tensor",
&deepspeed_aio_handle_t::new_cpu_locked_tensor,
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index f4a179deb9ec6..6913e9090bf57 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -92,7 +92,7 @@ def main_parallel_read(pool_params):
start_time = time.time()
dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
- ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True)
+ ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, 0, True)
assert ret != -1
handle.wait()
if dest_buffer == BOUNCE_BUFFER:
diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp
index f49f74394374e..b7055c8cc72b3 100644
--- a/csrc/gds/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp
@@ -95,8 +95,16 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op,
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
- const bool validate)
- : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate)
+ const bool validate,
+ const int64_t file_offset)
+ : io_op_desc_t(read_op,
+ buffer,
+ fd,
+ filename,
+ file_num_bytes,
+ intra_op_parallelism,
+ validate,
+ file_offset)
{
_contiguous_buffer = _buffer.contiguous();
const int64_t device = _buffer.get_device();
@@ -124,17 +132,17 @@ void gds_op_desc_t::run(const int tid,
{
assert(tid < _intra_op_parallelism);
check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
- int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr;
- const auto file_offset = _num_bytes_per_thread * tid;
+ const auto buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr;
+ const auto tid_file_offset = _file_offset + (_num_bytes_per_thread * tid);
if (_read_op) {
auto ret =
- cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
- if (ret < 0) { _report_error(ret, errno, buf_offset); }
+ cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset);
+ if (ret < 0) { _report_error(ret, errno, tid_file_offset); }
} else {
auto ret =
- cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
- if (ret < 0) { _report_error(ret, errno, buf_offset); }
+ cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset);
+ if (ret < 0) { _report_error(ret, errno, tid_file_offset); }
}
}
diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h
index 380bb0b9b6aed..d955527b1ba33 100644
--- a/csrc/gds/py_lib/deepspeed_gds_op.h
+++ b/csrc/gds/py_lib/deepspeed_gds_op.h
@@ -24,7 +24,8 @@ struct gds_op_desc_t : io_op_desc_t {
const char* filename,
const int64_t file_num_bytes,
const int intra_op_parallelism,
- const bool validate);
+ const bool validate,
+ const int64_t file_offset);
void run(const int tid,
std::unique_ptr<aio_context>& aio_ctxt,
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index c052144a0190b..f11245c75a5e6 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -107,12 +107,19 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc(
const int fd,
const char* filename,
const int64_t file_num_bytes,
- const bool validate)
+ const bool validate,
+ const int64_t file_offset)
{
if (buffer.is_cuda()) {
-        return std::make_shared<gds_op_desc_t>(
-            read_op, buffer, fd, filename, file_num_bytes, _intra_op_parallelism, validate);
+        return std::make_shared<gds_op_desc_t>(read_op,
+ buffer,
+ fd,
+ filename,
+ file_num_bytes,
+ _intra_op_parallelism,
+ validate,
+ file_offset);
}
return deepspeed_io_handle_t::_create_io_op_desc(
- read_op, buffer, fd, filename, file_num_bytes, validate);
+ read_op, buffer, fd, filename, file_num_bytes, validate, file_offset);
}
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
index 131e83e7b838a..25f68e177b2cb 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
@@ -42,7 +42,8 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t {
const int fd,
const char* filename,
const int64_t file_num_bytes,
- const bool validate);
+ const bool validate,
+ const int64_t file_offset);
static int s_cuFile_init;
};
diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp
index 57bf8d2207c45..2f165ee2c32a4 100644
--- a/csrc/gds/py_lib/py_ds_gds.cpp
+++ b/csrc/gds/py_lib/py_ds_gds.cpp
@@ -33,14 +33,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
"Synchronous and non-parallel file read. Returns count of completed read ops",
"buffer"_a,
"filename"_a,
- "validate"_a)
+ "validate"_a,
+ "file_offset"_a = 0)
.def("write",
&deepspeed_gds_handle_t::write,
"Synchronous and non-parallel file write. Returns count of completed write ops",
"buffer"_a,
"filename"_a,
- "validate"_a)
+ "validate"_a,
+ "file_offset"_a = 0)
.def("pread",
&deepspeed_gds_handle_t::pread,
@@ -48,7 +50,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
"buffer"_a,
"filename"_a,
"validate"_a,
- "async"_a)
+ "async"_a,
+ "file_offset"_a = 0)
.def("pwrite",
&deepspeed_gds_handle_t::pwrite,
@@ -56,33 +59,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
"buffer"_a,
"filename"_a,
"validate"_a,
- "async"_a)
+ "async"_a,
+ "file_offset"_a = 0)
.def("sync_pread",
&deepspeed_gds_handle_t::sync_pread,
"Synchrononous parallel file read. Returns count of completed read ops",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("sync_pwrite",
&deepspeed_gds_handle_t::sync_pwrite,
"Synchronous parallel file write. Returns count of completed write ops",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("async_pread",
&deepspeed_gds_handle_t::async_pread,
"Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
"following wait() returns count of completed ops.",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("async_pwrite",
&deepspeed_gds_handle_t::async_pwrite,
"Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
"count of completed ops.",
"buffer"_a,
- "filename"_a)
+ "filename"_a,
+ "file_offset"_a = 0)
.def("new_cpu_locked_tensor",
&deepspeed_gds_handle_t::new_cpu_locked_tensor,
diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h
index c06d3a6b35e93..6f500250f033a 100644
--- a/csrc/includes/cpu_adagrad.h
+++ b/csrc/includes/cpu_adagrad.h
@@ -14,9 +14,9 @@
#include "simd.h"
#define STEP(SPAN) \
-    template <typename ds_params_percision_t, typename ds_state_precision_t> \
- void Step_##SPAN(ds_params_percision_t* _params, \
- ds_params_percision_t* grads, \
+    template <typename ds_params_precision_t, typename ds_state_precision_t> \
+ void Step_##SPAN(ds_params_precision_t* _params, \
+ ds_params_precision_t* grads, \
ds_state_precision_t* _exp_avg_sq, \
size_t _param_size);
@@ -28,10 +28,10 @@ class Adagrad_Optimizer {
}
~Adagrad_Optimizer() {}
#if defined(__AVX512__) or defined(__AVX256__)
-    template <typename ds_params_percision_t, typename ds_state_precision_t>
+    template <typename ds_params_precision_t, typename ds_state_precision_t>
void Step_AVX(size_t* rounded_size,
- ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+ ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg_sq,
size_t param_size);
#endif
@@ -61,15 +61,15 @@ class Adagrad_Optimizer {
};
#if defined(__AVX512__) or defined(__AVX256__)
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
- ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+ ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
{
#if !defined(__AVX512__)
- if (std::is_same_v ||
+ if (std::is_same_v ||
std::is_same_v) {
return;
}
diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h
index faf99020aee52..a7db6fda37054 100644
--- a/csrc/includes/cpu_adam.h
+++ b/csrc/includes/cpu_adam.h
@@ -14,9 +14,9 @@
#include "simd.h"
#define STEP(SPAN) \
-    template <typename ds_params_percision_t, typename ds_state_precision_t> \
- void Step_##SPAN(ds_params_percision_t* _params, \
- ds_params_percision_t* grads, \
+    template <typename ds_params_precision_t, typename ds_state_precision_t> \
+ void Step_##SPAN(ds_params_precision_t* _params, \
+ ds_params_precision_t* grads, \
ds_state_precision_t* _exp_avg, \
ds_state_precision_t* _exp_avg_sq, \
size_t _param_size);
@@ -43,10 +43,10 @@ class Adam_Optimizer {
~Adam_Optimizer() {}
#if defined(__AVX512__) or defined(__AVX256__)
-    template <typename ds_params_percision_t, typename ds_state_precision_t>
+    template <typename ds_params_precision_t, typename ds_state_precision_t>
void Step_AVX(size_t* rounded_size,
- ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+ ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
ds_state_precision_t* _exp_avg_sq,
size_t param_size);
@@ -106,16 +106,16 @@ class Adam_Optimizer {
};
#if defined(__AVX512__) or defined(__AVX256__)
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
- ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+ ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
ds_state_precision_t* _exp_avg_sq,
size_t _param_size)
{
#if !defined(__AVX512__)
- if (std::is_same_v ||
+ if (std::is_same_v ||
std::is_same_v) {
return;
}
diff --git a/csrc/includes/cpu_lion.h b/csrc/includes/cpu_lion.h
index 62b3049232222..beaf357a32117 100644
--- a/csrc/includes/cpu_lion.h
+++ b/csrc/includes/cpu_lion.h
@@ -14,9 +14,9 @@
#include "simd.h"
#define STEP(SPAN) \
-    template <typename ds_params_percision_t, typename ds_state_precision_t> \
- void Step_##SPAN(ds_params_percision_t* _params, \
- ds_params_percision_t* grads, \
+    template <typename ds_params_precision_t, typename ds_state_precision_t> \
+ void Step_##SPAN(ds_params_precision_t* _params, \
+ ds_params_precision_t* grads, \
ds_state_precision_t* _exp_avg, \
size_t _param_size);
@@ -32,10 +32,10 @@ class Lion_Optimizer {
~Lion_Optimizer() {}
#if defined(__AVX512__) or defined(__AVX256__)
-    template <typename ds_params_percision_t, typename ds_state_precision_t>
+    template <typename ds_params_precision_t, typename ds_state_precision_t>
void Step_AVX(size_t* rounded_size,
- ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+ ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
size_t param_size);
#endif
@@ -67,15 +67,15 @@ class Lion_Optimizer {
};
#if defined(__AVX512__) or defined(__AVX256__)
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void Lion_Optimizer::Step_AVX(size_t* rounded_size,
- ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+ ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
size_t _param_size)
{
#if !defined(__AVX512__)
- if (std::is_same_v ||
+ if (std::is_same_v ||
std::is_same_v) {
return;
}
diff --git a/csrc/lion/cpu_lion_impl.cpp b/csrc/lion/cpu_lion_impl.cpp
index 85896ba86e194..6a98162314f9f 100644
--- a/csrc/lion/cpu_lion_impl.cpp
+++ b/csrc/lion/cpu_lion_impl.cpp
@@ -19,9 +19,9 @@ static std::unordered_map> s_optimizers;
// C++ interface
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Lion_Optimizer::Step_1(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Lion_Optimizer::Step_1(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
size_t _param_size)
{
@@ -64,9 +64,9 @@ void Lion_Optimizer::Step_1(ds_params_percision_t* _params,
}
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Lion_Optimizer::Step_4(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Lion_Optimizer::Step_4(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
size_t _param_size)
{
@@ -117,9 +117,9 @@ int create_lion_optimizer(int optimizer_id,
return 0;
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
-void Lion_Optimizer::Step_8(ds_params_percision_t* _params,
- ds_params_percision_t* grads,
+template <typename ds_params_precision_t, typename ds_state_precision_t>
+void Lion_Optimizer::Step_8(ds_params_precision_t* _params,
+ ds_params_precision_t* grads,
ds_state_precision_t* _exp_avg,
size_t _param_size)
{
@@ -134,15 +134,15 @@ void Lion_Optimizer::Step_8(ds_params_percision_t* _params,
(_param_size - rounded_size));
}
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void step_invoker(std::shared_ptr<Lion_Optimizer> opt,
void* _params,
void* grads,
void* _exp_avg,
size_t _param_size)
{
- opt->Step_8((ds_params_percision_t*)(_params),
- (ds_params_percision_t*)(grads),
+ opt->Step_8((ds_params_precision_t*)(_params),
+ (ds_params_precision_t*)(grads),
(ds_state_precision_t*)(_exp_avg),
_param_size);
}
@@ -152,12 +152,12 @@ std::map,
invokers;
// Fill map with template functions for each type
-template <typename ds_params_percision_t, typename ds_state_precision_t>
+template <typename ds_params_precision_t, typename ds_state_precision_t>
void create_invoker()
{
-    invokers[std::tuple(c10::CppTypeToScalarType<ds_params_percision_t>(),
+    invokers[std::tuple(c10::CppTypeToScalarType<ds_params_precision_t>(),
                        c10::CppTypeToScalarType<ds_state_precision_t>())] =
-        step_invoker<ds_params_percision_t, ds_state_precision_t>;
+        step_invoker<ds_params_precision_t, ds_state_precision_t>;
}
struct InvokerInitializer {
InvokerInitializer()
diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu
index 25a494111c54b..bbb8a7f00b1f5 100644
--- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu
+++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu
@@ -101,7 +101,15 @@ __global__ void apply_rotary_pos_half(T* mixed_query,
#if defined(__HIP_PLATFORM_AMD__) and ROCM_WAVEFRONT_SIZE == 64
#define LAUNCH_FOR_ALIGNMENT(ALIGNMENT) \
- if (threads_per_head == 64) { \
+ if (threads_per_head == 4) { \
+ LAUNCH_ROT_POS_EMB_HALF(4, ALIGNMENT); \
+ } else if (threads_per_head == 8) { \
+ LAUNCH_ROT_POS_EMB_HALF(8, ALIGNMENT); \
+ } else if (threads_per_head == 16) { \
+ LAUNCH_ROT_POS_EMB_HALF(16, ALIGNMENT); \
+ } else if (threads_per_head == 32) { \
+ LAUNCH_ROT_POS_EMB_HALF(32, ALIGNMENT); \
+ } else if (threads_per_head == 64) { \
LAUNCH_ROT_POS_EMB_HALF(64, ALIGNMENT); \
} else { \
assert(false); \
diff --git a/csrc/xpu/adam/multi_tensor_apply.dp.hpp b/csrc/xpu/adam/multi_tensor_apply.dp.hpp
index 01964e1a4d385..14a130e2a23e9 100644
--- a/csrc/xpu/adam/multi_tensor_apply.dp.hpp
+++ b/csrc/xpu/adam/multi_tensor_apply.dp.hpp
@@ -109,6 +109,12 @@ class multi_tensor_apply_kernel {
std::tuple args;
};
+// to make sure multi_tensor_apply_kernel can be used in sycl::buffer
+namespace sycl {
+template
+struct is_device_copyable> : std::true_type {};
+} // namespace sycl
+
template
void multi_tensor_apply(int block_size,
int chunk_size,
diff --git a/csrc/xpu/aio/deepspeed_cpu_op.cpp b/csrc/xpu/aio/deepspeed_cpu_op.cpp
deleted file mode 100644
index ee98c2d5cac2f..0000000000000
--- a/csrc/xpu/aio/deepspeed_cpu_op.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// SPDX-License-Identifier: Apache-2.0
-
-// DeepSpeed Team
-
-#include "deepspeed_cpu_op.h"
-
-using namespace std;
-
-cpu_op_desc_t::cpu_op_desc_t(const bool read_op,
- const torch::Tensor& buffer,
- const int fd,
- const char* filename,
- const long long int file_num_bytes,
- const int num_threads,
- const bool validate)
- : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate),
- _cpu_buffer(buffer)
-{
- // XPU don't handle buffer here. See XPU Accelerator pin_memory.
- _contiguous_buffer = _cpu_buffer.contiguous();
-}
-
-char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
-
-void cpu_op_desc_t::finish()
-{
- if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
-}
-
-void cpu_op_desc_t::validate()
-{
- validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes);
-}
-
-void cpu_op_desc_t::run(const int tid,
-                         std::unique_ptr<aio_context>& aio_ctxt,
- deepspeed_aio_config_t* aio_config)
-{
- assert(tid < _num_threads);
- const auto base_offset = _num_bytes_per_thread * tid;
-
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
- new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr()));
-
- if (aio_config->_overlap_events) {
- do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
- } else {
- do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
- }
-}
diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py
index d8655299282f7..a8d15cd5332bc 100755
--- a/deepspeed/__init__.py
+++ b/deepspeed/__init__.py
@@ -42,7 +42,7 @@
from .utils import log_dist, OnDevice, logger
from .comm.comm import init_distributed
-from .runtime import zero
+from .runtime import zero, domino
from .runtime.compiler import is_compile_supported
from .pipe import PipelineModule
@@ -165,8 +165,8 @@ def initialize(args=None,
if hasattr(args, "deepscale_config") and args.deepscale_config is not None:
logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************")
if hasattr(args, "deepspeed_config"):
- assert (args.deepspeed_config is
- None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
+ assert (args.deepspeed_config
+ is None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
args.deepspeed_config = args.deepscale_config
args.deepscale_config = None
diff --git a/deepspeed/autotuning/README.md b/deepspeed/autotuning/README.md
index b1fa435364d2a..fc76ed1e9f8d7 100755
--- a/deepspeed/autotuning/README.md
+++ b/deepspeed/autotuning/README.md
@@ -336,7 +336,7 @@ The Autotuner stops exploring the space when any of the following conditions mee
## Using Autotuning with Hugging Face
-Hugging Face users can set some configurations values to ["auto"](https://huggingface.co/transformers/main_classes/deepspeed.html?highlight=gradient_accumulation_steps#shared-configuration).
+Hugging Face users can set some configurations values to ["auto"](https://huggingface.co/docs/transformers/deepspeed#deepspeed-and-trainer-parameters).
`"auto"` means the value will be set to the default in Hugging Face or be overwritten using the supplied values from the command line arguments.
In DeepSpeed Autotuning, if the user-provided DeepSpeed configuration file has "auto" keywords, they are treated as the value "auto".
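For concreteness, a config sketch of the kind this section describes, with "auto" placeholders that Hugging Face or the autotuner resolves (the keys shown are illustrative; see the linked documentation for the authoritative set):

ds_config = {
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {"stage": 2},
    "autotuning": {"enabled": True, "fast": True},
}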
diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py
index dfd195bc37ebd..a72b3c951e973 100755
--- a/deepspeed/autotuning/autotuner.py
+++ b/deepspeed/autotuning/autotuner.py
@@ -248,8 +248,8 @@ def mp_size(self):
return self.autotuning_config.mp_size
def max_train_micro_batch_size_per_gpu(self):
- if self.max_train_batch_size(
- ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size
+ if self.max_train_batch_size() and self.max_train_batch_size(
+ ) > 0: # if the user specifies a max_train_batch_size
max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size() // (
self.exp_num_gpus * self.exp_num_nodes) # gradient accumulation steps >=1
return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, max_train_micro_batch_size)
@@ -964,8 +964,8 @@ def get_min_max_micro_batch_size(self, stage, min_micro_batch_size, calculated_m
low = mid + 1
self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mid)
- if prev_metric_val and (
- (metric_val - prev_metric_val) / prev_metric_val) < METRIC_PERCENT_DIFF_CONST:
+ if prev_metric_val and ((metric_val - prev_metric_val) /
+ prev_metric_val) < METRIC_PERCENT_DIFF_CONST:
logger.info(f"performance plateaus at mbs = {low}")
break
prev_metric_val = metric_val
@@ -1026,8 +1026,8 @@ def get_tuning_micro_batch_size_list(self, min_micro_batch_size, max_micro_batch
# NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} ))
# DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) ))
# GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) ))
- if self.max_train_batch_size(
- ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size
+ if self.max_train_batch_size() and self.max_train_batch_size(
+ ) > 0: # if the user specifies a max_train_batch_size
max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size() // (self.exp_num_gpus *
self.exp_num_nodes)
else:
diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py
index 31997177a2623..9a368b7a0a254 100644
--- a/deepspeed/checkpoint/deepspeed_checkpoint.py
+++ b/deepspeed/checkpoint/deepspeed_checkpoint.py
@@ -116,7 +116,7 @@ def show_transformer_file_map(self):
self._dump_mapping(self.transformer_file_map, 'rank_to_transformer_files')
def _build_global_state(self):
- sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
+ sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None)
@@ -137,14 +137,17 @@ def get_final_norm_layer_id(self):
def get_iteration(self):
if not ITERATION_KEY in self.global_state:
- sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
+ sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
return self.global_state[ITERATION_KEY]
def get_embedding_state(self, tp_index: int) -> Dict:
assert tp_index in self.tp_to_embedding_map.keys()
- sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]]
+ sd_list = [
+ torch.load(fname, map_location=torch.device('cpu'), weights_only=False)
+ for fname in self.tp_to_embedding_map[tp_index]
+ ]
sd = self._merge_state_dicts(sd_list)
return sd
@@ -154,7 +157,7 @@ def get_embedding_files(self, tp_index: int) -> list:
def _get_checkpoint_value(self, key):
if not key in self.global_state:
- sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
+ sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[key] = sd.get(key, None)
return self.global_state[key]
@@ -169,7 +172,7 @@ def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict:
assert tp_index < self.tp_degree
assert pp_index < self.pp_degree
fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index)
- sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list]
+ sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list]
merged_sd = None
for sd in sd_list:
@@ -185,7 +188,7 @@ def get_transformer_state(self, tp_index: int, pp_index: int) -> list:
assert pp_index < self.pp_degree
t_list = []
for fname_list in self.transformer_file_map[(tp_index, pp_index)]:
- sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list]
+ sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list]
sd = self._merge_state_dicts(sd_list)
t_list.append(sd)
return t_list
@@ -196,7 +199,7 @@ def get_pp_transformer_map(self, pp_index: int) -> list:
def get_final_norm_state(self, tp_index: int) -> Dict:
assert tp_index in self.tp_to_final_norm_map.keys()
- sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'))
+ sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'), weights_only=False)
return sd
def get_final_norm_files(self, tp_index: int) -> list:
diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py
index e5974a30df22e..f7b75eee66d06 100755
--- a/deepspeed/checkpoint/ds_to_universal.py
+++ b/deepspeed/checkpoint/ds_to_universal.py
@@ -150,7 +150,7 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D):
def extract_zero_shards_stage3(optim_files, param_shapes, dp_degree, temp_dir, dp_index):
- state_dict = torch.load(optim_files[dp_index], map_location='cpu')
+ state_dict = torch.load(optim_files[dp_index], map_location='cpu', weights_only=False)
flat_state = dict(
exp_avg=state_dict[OPTIMIZER_STATE_DICT]['optimizer_state_dict']['state'][0]["exp_avg"],
@@ -214,7 +214,7 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape=None):
raise ValueError(f"Cannot parse dp_rank from {p}")
paths = [f"{prefix_path}.{dp_index_to_str(dp_index)}" for dp_index in sorted(list(dp_indices))]
- shards = [torch.load(p) for p in paths]
+ shards = [torch.load(p, weights_only=False) for p in paths]
if state == "step":
assert all(v == shards[0] for v in shards), "All shards must have the same step value"
@@ -404,7 +404,7 @@ def _zero_partitioned_param_info(unpartitioned_numel, world_size):
def _parse_model_states_stage3(files):
- return torch.load(files[0], map_location=torch.device('cpu'))[PARAM_SHAPES]
+ return torch.load(files[0], map_location=torch.device('cpu'), weights_only=False)[PARAM_SHAPES]
def _save_optimizer_state(args, ds_checkpoint):
@@ -420,7 +420,7 @@ def _save_optimizer_state(args, ds_checkpoint):
def _save_optimizer_state_stage3(args, optim_files):
- sd = torch.load(optim_files[0], map_location=torch.device('cpu'))
+ sd = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False)
output_sd = sd[OPTIMIZER_STATE_DICT]
output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS]
zero_output_folder = os.path.join(args.output_folder, "zero")
@@ -446,7 +446,7 @@ def _get_checkpoint_files(checkpoint_dir, glob_pattern):
def _get_zero_stage(optim_files):
- state_dict = torch.load(optim_files[0], map_location=torch.device('cpu'))
+ state_dict = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False)
optimizer_state = state_dict[OPTIMIZER_STATE_DICT]
zero_stage = optimizer_state.get(ZERO_STAGE, 1)
return zero_stage
@@ -454,7 +454,7 @@ def _get_zero_stage(optim_files):
def _inject_missing_state(ds_checkpoint):
if UNIVERSAL_CHECKPOINT_INFO not in ds_checkpoint.global_state:
- sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu'))
+ sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
if UNIVERSAL_CHECKPOINT_INFO not in sd:
ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {}
ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO][
@@ -488,7 +488,7 @@ def main(args):
slice_shapes = []
for mp_rank_file in ds_checkpoint.mp_rank_files:
- mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'))
+ mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'), weights_only=False)
slice_shapes += mp_sd[PARAM_SHAPES]
# fix back to normal flat dict, merge duplicates for tp>1
diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py
index 064891a8bb540..266d5a0635951 100644
--- a/deepspeed/checkpoint/universal_checkpoint.py
+++ b/deepspeed/checkpoint/universal_checkpoint.py
@@ -34,7 +34,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size):
step = None
for key in hp_keys:
ckpt_file = os.path.join(folder, f"{key}.pt")
- ckpt_dict = torch.load(ckpt_file)
+ ckpt_dict = torch.load(ckpt_file, weights_only=False)
if key == "step":
step = ckpt_dict
diff --git a/deepspeed/checkpoint/utils.py b/deepspeed/checkpoint/utils.py
index c305e8884e83b..5964da00728e4 100644
--- a/deepspeed/checkpoint/utils.py
+++ b/deepspeed/checkpoint/utils.py
@@ -51,7 +51,12 @@ def clone_tensors_for_torch_save(item, device=torch.device('cpu')):
- copy of ``item`` with cloned tensors on target device
"""
if torch.is_tensor(item):
- return item.detach().clone().to(device)
+ if type(device) is str:
+ device = torch.device(device)
+ if device == item.device:
+ return item.detach().clone()
+ else:
+ return item.detach().to(device)
elif isinstance(item, list):
return [clone_tensors_for_torch_save(v, device) for v in item]
elif isinstance(item, tuple):
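A caller-side sketch of the updated helper (a list input keeps it within the branches shown above; the output path is illustrative): tensors already on the target device are detached and cloned, while tensors elsewhere are detached and moved, skipping the unconditional clone the old code performed.

import torch
from deepspeed.checkpoint.utils import clone_tensors_for_torch_save

state = [torch.ones(4), torch.ones(2, device="cpu")]
cpu_state = clone_tensors_for_torch_save(state, device="cpu")  # str is normalized to torch.device
torch.save(cpu_state, "sharded_state.pt")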
diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py
index 6730b93dfd4fb..c85f0241005d1 100644
--- a/deepspeed/checkpoint/zero_checkpoint.py
+++ b/deepspeed/checkpoint/zero_checkpoint.py
@@ -54,7 +54,7 @@ def get_state_for_rank(self, pp_index, tp_index, dp_index, keys_to_ignore=[], st
state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index)
merged_sd = None
for state_file in state_file_list:
- sd = torch.load(state_file, map_location=torch.device('cpu'))
+ sd = torch.load(state_file, map_location=torch.device('cpu'), weights_only=False)
for key in keys_to_ignore:
sd.pop(key, None)
diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py
index ed2645d415c41..988b74232bb92 100755
--- a/deepspeed/comm/torch.py
+++ b/deepspeed/comm/torch.py
@@ -390,9 +390,14 @@ def init_device_mesh(self, mesh_shape, mesh_dim_names):
if not required_torch_version(min_version=2.2):
raise RuntimeError(f"Current torch version does not have device mesh"
f"api (torch.__version__: {torch.__version__})")
- return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(),
- mesh_shape,
- mesh_dim_names=mesh_dim_names)
+ if not required_torch_version(max_version=2.4):
+ return torch.distributed.device_mesh.init_device_mesh(get_accelerator().device_name(),
+ mesh_shape,
+ mesh_dim_names=mesh_dim_names)
+ else:
+ return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(),
+ mesh_shape,
+ mesh_dim_names=mesh_dim_names)
# This will become a light-weight wrapper around torch.distributed functions
diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py
index c6a69dd2a49fb..8fd4293d312c0 100644
--- a/deepspeed/elasticity/elastic_agent.py
+++ b/deepspeed/elasticity/elastic_agent.py
@@ -160,8 +160,8 @@ def _invoke_run(self, role: str = "default") -> RunResult:
f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish.")
self._exit_barrier()
return run_result
- elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED
- } or len(participants) > len(rdzv_handler._state_holder.state.participants):
+ elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED} or len(participants) > len(
+ rdzv_handler._state_holder.state.participants):
if self._remaining_restarts > 0:
log.info(f"[{role}] Worker group {state.name}. "
f"{self._remaining_restarts}/{spec.max_restarts} attempts left;"
diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py
index 6574d49fb1323..cfca1ff4fe4cf 100755
--- a/deepspeed/inference/engine.py
+++ b/deepspeed/inference/engine.py
@@ -452,7 +452,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None):
checkpoint = sd_loader['checkpoints']
if type(checkpoint) is list:
- self.sd = torch.load(checkpoint[0], map_location='cpu')
+ self.sd = torch.load(checkpoint[0], map_location='cpu', weights_only=False)
self.key_list = list(self.sd.keys())
self.load_model_with_checkpoint(self.module)
@@ -460,7 +460,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None):
for i in range(1, len(checkpoint)):
if not dist.is_initialized() or dist.get_rank() == 0:
print(f"loading checkpoint ({i})")
- self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name())
+ self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name(), weights_only=False)
self.key_list = list(self.sd.keys())
self.load_model_with_checkpoint(self.module)
else:
diff --git a/deepspeed/inference/v2/checkpoint/huggingface_engine.py b/deepspeed/inference/v2/checkpoint/huggingface_engine.py
index d88d99ebebfd1..b17bb886838f1 100644
--- a/deepspeed/inference/v2/checkpoint/huggingface_engine.py
+++ b/deepspeed/inference/v2/checkpoint/huggingface_engine.py
@@ -80,7 +80,7 @@ def model_has_safetensors(model_name_or_path: str) -> bool:
else:
model_param_json_fname = "pytorch_model.bin.index.json"
model_file_fname = "pytorch_model.bin"
- self._checkpoint_load_fn = partial(torch.load, map_location="cpu")
+ self._checkpoint_load_fn = partial(torch.load, map_location="cpu", weights_only=False)
model_param_json = os.path.join(self._local_checkpoint_dir, model_param_json_fname)
diff --git a/deepspeed/inference/v2/model_implementations/inference_policy_base.py b/deepspeed/inference/v2/model_implementations/inference_policy_base.py
index d5a326c03599e..2f4266a8cb880 100644
--- a/deepspeed/inference/v2/model_implementations/inference_policy_base.py
+++ b/deepspeed/inference/v2/model_implementations/inference_policy_base.py
@@ -205,7 +205,7 @@ def populate_model_parameters(self) -> None:
buffer_path = make_param_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size)
metadata_path = make_metadata_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size)
- buffer = torch.load(buffer_path)
+ buffer = torch.load(buffer_path, weights_only=False)
metadata = json.load(open(metadata_path, "r"))
metadata = ModelMetadata.parse_raw(metadata)
diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py
index f26c875566650..feb65b4a5f5d1 100644
--- a/deepspeed/inference/v2/model_implementations/layer_container_base.py
+++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py
@@ -14,7 +14,7 @@
# Currently have dependency loops for the type hints.
InferenceModel = Type["InferenceModel"]
-LayerContainer = Type["LayerContainer"]
+LayerContainer = Type["LayerContainer"] # noqa: F811
MAPPING_KEY = "PARAM_MAPPING"
PLIST_HELPERS = "_ds_plist_strip_vals"
@@ -161,7 +161,7 @@ def __call__(cls, *args, **kwargs):
return instance
-class LayerContainer(metaclass=LayerMetaclass):
+class LayerContainer(metaclass=LayerMetaclass): # noqa: F811
"""
Abstract base class for containing model parameters.
diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index cf70c4530c822..7afe6ca903fb2 100644
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -277,8 +277,10 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
if hasattr(model_config, "vision_config"):
if "MllamaVisionEncoderLayer" in str(module):
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config)
- else:
+ elif hasattr(model_config, "text_config"):
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config)
+ else:
+ num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
@@ -345,7 +347,7 @@ def set_lm_head(module):
"weight") and not module.embed_out.weight.is_meta and isinstance(
module.embed_out, torch.nn.Linear):
module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out")
- elif hasattr(module.language_model, "lm_head"):
+ elif hasattr(module, "language_model") and hasattr(module.language_model, "lm_head"):
module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head")
return module
@@ -413,7 +415,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
- sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu')]
+ sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu', weights_only=False)]
load_model_with_checkpoint(replaced_module,
sd,
mp_replace,
@@ -435,7 +437,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j]
for j in range(sd_count)
]
- sds = [torch.load(ckpt_file, map_location='cpu') for ckpt_file in ckpt_files]
+ sds = [torch.load(ckpt_file, map_location='cpu', weights_only=False) for ckpt_file in ckpt_files]
load_model_with_checkpoint(replaced_module,
sds,
mp_replace,
@@ -455,7 +457,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
pbar.update(1)
ckpt_file = os.path.join(base_dir1,
checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i]
- sds = [torch.load(ckpt_file, map_location='cpu')]
+ sds = [torch.load(ckpt_file, map_location='cpu', weights_only=False)]
load_model_with_checkpoint(replaced_module,
sds,
mp_replace,
@@ -494,9 +496,10 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
if not dist.is_initialized() or dist.get_rank() == 0:
print("Saving tp-sharded checkpoints")
torch.save(
- OrderedDict({k: v
- for k, v in dict(replaced_module.state_dict()).items()
- if transformer_name not in k}), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}')
+ OrderedDict({
+ k: v
+ for k, v in dict(replaced_module.state_dict()).items() if transformer_name not in k
+ }), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}')
dtype_reprs = {
torch.float32: 'float32',
@@ -621,7 +624,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No
from safetensors.torch import load_file
sd = load_file(checkpoint)
else:
- sd = torch.load(checkpoint, map_location='cpu')
+ sd = torch.load(checkpoint, map_location='cpu', weights_only=False)
policy = {}
if orig_class is not None:
diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py
index 6758c7a657f63..57be0c7938560 100644
--- a/deepspeed/module_inject/tp_shard.py
+++ b/deepspeed/module_inject/tp_shard.py
@@ -24,7 +24,9 @@ def set_n_embd(num):
def get_num_kv_heads():
global num_kv_heads
- return num_kv_heads
+ if 'num_kv_heads' in globals():
+ return num_kv_heads
+ return None
def get_num_attention_heads():
diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py
index 96306184e42cf..f87f1beb7e4eb 100644
--- a/deepspeed/profiling/flops_profiler/profiler.py
+++ b/deepspeed/profiling/flops_profiler/profiler.py
@@ -115,7 +115,7 @@ def start_time_hook(module, input):
get_accelerator().synchronize()
module.__start_time__ = time.time()
- if not hasattr(module, "__start_time_hook_handle"):
+ if not hasattr(module, "__start_time_hook_handle__"):
module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook)
def end_time_hook(module, input, output):
diff --git a/deepspeed/runtime/base_optimizer.py b/deepspeed/runtime/base_optimizer.py
index 6cfd66f1cc38a..b8df7499450db 100644
--- a/deepspeed/runtime/base_optimizer.py
+++ b/deepspeed/runtime/base_optimizer.py
@@ -22,7 +22,7 @@ def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, chec
optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt")
assert os.path.isfile(
optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.'
- optim_sd = torch.load(optim_state_path)
+ optim_sd = torch.load(optim_state_path, weights_only=False)
self._load_global_state(optim_sd)
diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py
index e26e3243c4b5d..e834bf0d22d79 100644
--- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py
+++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py
@@ -58,7 +58,7 @@ def load(self, path: str, map_location=None):
if not self.enable_nebula_load and first_load_flag:
self.tag_flag = tag
logger.info(f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...")
- partition = torch.load(path, map_location=map_location)
+ partition = torch.load(path, map_location=map_location, weights_only=False)
logger.info(f"[Nebula] Disable nebula load. Loaded checkpoint from {path} .")
return partition
diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py
index 5cd44864bb2ea..076c638532ad3 100644
--- a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py
+++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py
@@ -25,7 +25,7 @@ def save(self, state_dict, path: str):
def load(self, path: str, map_location=None):
logger.info(f"[Torch] Loading checkpoint from {path}...")
- partition = torch.load(path, map_location=map_location)
+ partition = torch.load(path, map_location=map_location, weights_only=False)
logger.info(f"[Torch] Loaded checkpoint from {path}.")
return partition
diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py
index 879c0a1a2cc9c..fa9220f4fcd0f 100644
--- a/deepspeed/runtime/compiler.py
+++ b/deepspeed/runtime/compiler.py
@@ -5,6 +5,15 @@
import torch
+try:
+ from torch.compiler import is_compiling as torch_is_compiling
+except ImportError:
+ try:
+ from torch._dynamo.external_utils import is_compiling as torch_is_compiling
+ except ImportError:
+ # Torch does not have compiler support
+ torch_is_compiling = lambda: False
+
def is_compile_supported():
return hasattr(torch, "compiler") and hasattr(torch.nn.Module, "compile")
@@ -14,3 +23,7 @@ def disable(func):
if is_compile_supported():
return torch.compiler.disable(func)
return func
+
+
+def is_compiling():
+ return torch_is_compiling()
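A hedged caller-side sketch of the new helper: host-side work such as logging can branch on `is_compiling()` to stay out of traced graphs; everything below other than `is_compiling` itself is illustrative.

from deepspeed.runtime.compiler import is_compiling

def maybe_log(step, loss):
    # Skip Python-side printing while torch.compile is tracing the caller.
    if is_compiling():
        return
    print(f"step {step}: loss {loss:.4f}")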
diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index 8be2f7ac40557..fb786f29722d7 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -1012,8 +1012,8 @@ def _do_error_check(self):
self.gradient_accumulation_steps), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS)
if self.zero_enabled:
- assert (self.zero_optimization_stage <=
- ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(
+ assert (self.zero_optimization_stage
+ <= ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(
ZeroStageEnum.max_stage)
if self.fp16_master_weights_and_gradients:
diff --git a/deepspeed/runtime/domino/__init__.py b/deepspeed/runtime/domino/__init__.py
new file mode 100644
index 0000000000000..208299fb8c50f
--- /dev/null
+++ b/deepspeed/runtime/domino/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
diff --git a/deepspeed/runtime/domino/transformer.py b/deepspeed/runtime/domino/transformer.py
new file mode 100644
index 0000000000000..8eb95e49c29d0
--- /dev/null
+++ b/deepspeed/runtime/domino/transformer.py
@@ -0,0 +1,518 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import enum
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+import deepspeed
+from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+
+
+def is_rank_0():
+    return dist.get_rank() == 0
+
+
+class DominoModule(torch.nn.Module):
+    """Extensions of torch.nn.Module."""
+
+    def __init__(self):
+        super(DominoModule, self).__init__()
+
+
+class LayerType(enum.Enum):
+ encoder = 1
+ decoder = 2
+
+
+class AttnType(enum.Enum):
+ self_attn = 1
+ cross_attn = 2
+
+
+class AttnMaskType(enum.Enum):
+ padding = 1
+ causal = 2
+
+
+class ModelType(enum.Enum):
+ encoder_or_decoder = 1
+ encoder_and_decoder = 2
+
+
+handle_dic = {}
+
+
+def no_oper(input_, dic_, h_id):
+ return NoOper.apply(input_, dic_, h_id)
+
+
+class NoOper(torch.autograd.Function):
+
+ @staticmethod
+ def symbolic(graph, input_, handle_dic, h_id):
+ return input_
+
+ @staticmethod
+ def forward(ctx, input_, handle_dic, h_id):
+ ctx.handle_dic = handle_dic
+ ctx.h_id = h_id
+ return input_
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ handle = ctx.handle_dic[ctx.h_id]
+ handle.wait()
+ return grad_output, None, None
+
+
+def copy_to_tensor_model_parallel_region_a(mpu, input_, dic_, h_id):
+ return _CopyToModelParallelRegionA.apply(mpu, input_, dic_, h_id)
+
+
+class _CopyToModelParallelRegionA(torch.autograd.Function):
+ """Pass the input to the model parallel region."""
+
+ @staticmethod
+ def symbolic(graph, mpu, input_, handle_dic, h_id):
+ return input_
+
+ @staticmethod
+ def forward(ctx, mpu, input_, handle_dic, h_id):
+ ctx.mpu = mpu
+ ctx.handle_dic = handle_dic
+ ctx.h_id = h_id
+ return input_
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ # Bypass the function if we are using only 1 GPU.
+ if ctx.mpu.get_tensor_model_parallel_world_size() == 1:
+ return grad_output
+
+ # Async All-reduce.
+ handle = deepspeed.comm.all_reduce(grad_output, group=ctx.mpu.get_tensor_model_parallel_group(), async_op=True)
+ ctx.handle_dic[ctx.h_id] = handle
+ return None, grad_output, None, None
+
+
+class CoreAttention(DominoModule):
+
+ def __init__(self, config, layer_number, mpu, attn_mask_type=AttnMaskType.causal):
+ super(CoreAttention, self).__init__()
+
+ self.layer_number = max(1, layer_number)
+ self.att_dropout_p = config.attention_dropout
+ self.is_causal = True
+ projection_size = config.kv_channels * config.num_attention_heads
+ world_size = mpu.get_tensor_model_parallel_world_size()
+ self.hidden_size_per_partition = projection_size // world_size
+
+ def forward(self, query_layer, key_layer, value_layer, attention_mask):
+ # attn_mask is None when is_causal=True
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer,
+ key_layer,
+ value_layer,
+ attn_mask=None,
+ dropout_p=self.att_dropout_p,
+ is_causal=True,
+ scale=None)
+
+ # [b, np, sq, hn] --> [sq, b, np, hn]
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+
+ # [sq, b, np, hn] --> [sq, b, hp]
+ new_context_layer_shape = context_layer.size()[:-2] + \
+ (self.hidden_size_per_partition,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ return context_layer
+
+
+class ShardedAttention(DominoModule):
+ """Sharded self-attention layer class.
+    Only supports self-attention with a causal attention mask.
+ """
+
+ def __init__(self,
+ config,
+ layer_number,
+ mpu,
+ ColumnParallelLinear,
+ RowParallelLinearNoComm,
+ apply_rotary_pos_emb,
+ attention_type=AttnType.self_attn,
+ attn_mask_type=AttnMaskType.causal):
+ super(ShardedAttention, self).__init__()
+ self.layer_number = max(1, layer_number)
+ self.attention_type = attention_type
+ self.attn_mask_type = attn_mask_type
+ self.params_dtype = config.params_dtype
+ self.apply_rotary_pos_emb = apply_rotary_pos_emb
+
+ query_projection_size = config.kv_channels * config.num_attention_heads
+ kv_projection_size = config.kv_channels * config.num_attention_heads
+
+ # Per attention head and per partition values.
+ world_size = mpu.get_tensor_model_parallel_world_size()
+ self.hidden_size_per_attention_head = query_projection_size // config.num_attention_heads
+ self.num_attention_heads_per_partition = config.num_attention_heads // world_size
+
+ self.query_key_value = ColumnParallelLinear(config.hidden_size,
+ query_projection_size + 2 * kv_projection_size,
+ config=config,
+ init_method=config.init_method,
+ bias=config.add_bias_linear,
+ gather_output=False)
+
+ self.core_attention = CoreAttention(config, self.layer_number, mpu, self.attn_mask_type)
+
+ self.dense = RowParallelLinearNoComm(query_projection_size,
+ config.hidden_size,
+ config=config,
+ init_method=config.output_layer_init_method,
+ bias=config.add_bias_linear,
+ input_is_parallel=True,
+ skip_bias_add=True)
+
+ def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
+ # hidden_states: [s, b, h]
+
+ # Query, Key, and Value
+        # Attention heads [s, b, h] --> [s, b, np * 3 * hn]
+ mixed_x_layer, _ = self.query_key_value(hidden_states)
+
+ # [s, b, np * 3 * hn] --> [s, b, np, 3 * hn]
+ new_tensor_shape = mixed_x_layer.size()[:-1] + (
+ self.num_attention_heads_per_partition,
+ 3 * self.hidden_size_per_attention_head,
+ )
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+
+ # [s, b, np, 3 * hn] -> [b, np, s, 3*hn]
+ mixed_x_layer = mixed_x_layer.permute(1, 2, 0, 3).contiguous()
+
+        # [b, np, s, 3 * hn] --> 3 x [b, np, s, hn]
+ (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, [
+ self.hidden_size_per_attention_head, self.hidden_size_per_attention_head,
+ self.hidden_size_per_attention_head
+ ],
+ dim=3)
+        # reshape to [b, np, s, hn]
+ query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1,
+ self.hidden_size_per_attention_head)
+
+ # apply rotary embedding
+ if rotary_pos_emb is not None:
+            if not isinstance(rotary_pos_emb, tuple):
+                rotary_pos_emb = (rotary_pos_emb, ) * 2
+ q_pos_emb, k_pos_emb = rotary_pos_emb
+ query_layer = self.apply_rotary_pos_emb(query_layer, q_pos_emb)
+ key_layer = self.apply_rotary_pos_emb(key_layer, k_pos_emb)
+
+ context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
+
+ # Output. [s, b, h]
+ output, bias = self.dense(context_layer)
+
+ return output, bias
+
+
+class DominoTransformerLayer(DominoModule):
+    """A single Domino transformer layer.
+ [s, b, h] -> [s, b, h]
+ """
+
+ def __init__(self,
+ config,
+ layer_number,
+ mpu,
+ fused_layer_norm,
+ _initialize_affine_weight_gpu,
+ ColumnParallelLinear,
+ RowParallelLinearNoComm,
+ apply_rotary_pos_emb,
+ bias_dropout_add_fused_train,
+ bias_dropout_add_fused_inference,
+ skip_bias_add=True,
+ layer_type=LayerType.encoder,
+ self_attn_mask_type=AttnMaskType.causal,
+ drop_path_rate=0.,
+ output_bias=None):
+ super(DominoTransformerLayer, self).__init__()
+
+ self.llama_model = config.llama_model
+ self.layer_number = layer_number
+ self.layer_type = layer_type
+ self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
+ self.bias_dropout_add_fused_train = bias_dropout_add_fused_train
+ self.bias_dropout_add_fused_inference = bias_dropout_add_fused_inference
+ self.mpu = mpu
+ self.output_bias = output_bias
+
+ # Layernorm on the input data.
+ self.input_layernorm = fused_layer_norm(config.hidden_size,
+ eps=config.layernorm_epsilon,
+ no_persist_layer_norm=config.no_persist_layer_norm)
+
+ # Self attention.
+ self.self_attention = ShardedAttention(config,
+ layer_number,
+ mpu,
+ ColumnParallelLinear,
+ RowParallelLinearNoComm,
+ apply_rotary_pos_emb,
+ attention_type=AttnType.self_attn,
+ attn_mask_type=self_attn_mask_type)
+
+ self.hidden_dropout = config.hidden_dropout
+
+ # Layernorm on the attention output
+ self.post_attention_layernorm = fused_layer_norm(config.hidden_size,
+ eps=config.layernorm_epsilon,
+ no_persist_layer_norm=config.no_persist_layer_norm)
+
+ # ------------ init mlp start ------------
+ init_method = config.init_method
+ output_layer_init_method = config.output_layer_init_method
+ self.add_bias = config.add_bias_linear
+ self.skip_bias_add = skip_bias_add
+
+ ffn_hidden_size = config.ffn_hidden_size
+ if config.gated_linear_unit:
+ ffn_hidden_size *= 2
+ self.output_size_c = config.ffn_hidden_size
+ self.input_size_c = config.hidden_size
+ self.input_size_r = config.ffn_hidden_size
+ self.output_size_r = self.input_size_c
+
+ world_size = mpu.get_tensor_model_parallel_world_size()
+ self.output_size_per_partition = self.output_size_c // world_size
+ self.input_size_per_partition = self.input_size_r // world_size
+
+ # Initialize weight.
+ self.weight_c = Parameter(
+ torch.empty(self.output_size_per_partition,
+ self.input_size_c,
+ device=get_accelerator().current_device_name(),
+ dtype=config.params_dtype))
+ self.weight_r = Parameter(
+ torch.empty(self.output_size_r,
+ self.input_size_per_partition,
+ device=get_accelerator().current_device_name(),
+ dtype=config.params_dtype))
+
+ if config.perform_initialization:
+ _initialize_affine_weight_gpu(self.weight_c, init_method, partition_dim=0, stride=1)
+
+ _initialize_affine_weight_gpu(self.weight_r, output_layer_init_method, partition_dim=1, stride=1)
+
+ if self.add_bias:
+ self.bias_c = Parameter(
+ torch.empty(self.output_size_per_partition,
+ device=get_accelerator().current_device_name(),
+ dtype=config.params_dtype))
+ self.bias_r = Parameter(
+ torch.empty(self.output_size_r,
+ device=get_accelerator().current_device_name(),
+ dtype=config.params_dtype))
+ if config.perform_initialization:
+ with torch.no_grad():
+ self.bias_c.zero_()
+ self.bias_r.zero_()
+ else:
+ self.register_parameter('bias_c', None)
+ self.register_parameter('bias_r', None)
+
+ if config.swiglu:
+
+ def swiglu(x):
+ x = torch.chunk(x, 2, dim=-1)
+ return F.silu(x[0]) * x[1]
+
+ self.mlp_activation_func = swiglu
+ else:
+ self.mlp_activation_func = F.gelu
+ # ------------ init mlp end ------------
+
+ def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
+ # hidden_states: [s, b, h]
+ hidden_states0, hidden_states1 = hidden_states
+
+ layernorm_output0 = self.input_layernorm(hidden_states0)
+ layernorm_output1 = self.input_layernorm(hidden_states1)
+
+ if not self.llama_model:
+ rotary_pos_emb = None
+
+ attention_output0, attention_bias0 = \
+ self.self_attention(
+ layernorm_output0,
+ attention_mask,
+ rotary_pos_emb=rotary_pos_emb)
+ handle0 = deepspeed.comm.all_reduce(attention_output0,
+ group=self.mpu.get_tensor_model_parallel_group(),
+ async_op=True)
+
+ attention_output1, attention_bias1 = \
+ self.self_attention(
+ layernorm_output1,
+ attention_mask,
+ rotary_pos_emb=rotary_pos_emb)
+ handle1 = deepspeed.comm.all_reduce(attention_output1,
+ group=self.mpu.get_tensor_model_parallel_group(),
+ async_op=True)
+ handle0.wait()
+
+ # Residual0 connection.
+ if self.apply_residual_connection_post_layernorm:
+ residual0 = layernorm_output0
+ else:
+ residual0 = hidden_states0
+
+ if self.training:
+ bias_dropout_add_func = self.bias_dropout_add_fused_train
+ else:
+ bias_dropout_add_func = self.bias_dropout_add_fused_inference
+ if attention_bias0 is not None:
+ attention_bias0 = attention_bias0.expand_as(residual0)
+ layernorm_input0 = bias_dropout_add_func(attention_output0, attention_bias0, residual0, self.hidden_dropout)
+
+ layernorm_output0 = self.post_attention_layernorm(layernorm_input0)
+ layernorm_output0 = no_oper(layernorm_output0, handle_dic, f'{self.layer_number}_0')
+
+ # Residual1 connection.
+ if self.apply_residual_connection_post_layernorm:
+ residual1 = layernorm_output1
+ else:
+ residual1 = hidden_states1
+
+ if attention_bias1 is not None:
+ attention_bias1 = attention_bias1.expand_as(residual1)
+ layernorm_input1 = bias_dropout_add_func(attention_output1, attention_bias1, residual1, self.hidden_dropout)
+
+ layernorm_output1 = self.post_attention_layernorm(layernorm_input1)
+ layernorm_output1 = no_oper(layernorm_output1, handle_dic, f'{self.layer_number}_1')
+
+ # ------------ explicit mlp start ------------
+ bias_c = self.bias_c if not self.skip_bias_add else None
+
+ input0 = copy_to_tensor_model_parallel_region_a(self.mpu, layernorm_output0, handle_dic,
+ f'{self.layer_number}_0')
+ # Batch0 Matrix multiply.
+ output0 = torch.matmul(input0, self.weight_c.t())
+ if bias_c is not None:
+ output0 = output0 + bias_c
+ output0 = self.mlp_activation_func(output0)
+ output0 = torch.matmul(output0, self.weight_r.t())
+ handle2 = deepspeed.comm.all_reduce(output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True)
+
+ handle1.wait()
+
+ input1 = copy_to_tensor_model_parallel_region_a(self.mpu, layernorm_output1, handle_dic,
+ f'{self.layer_number}_1')
+ # Batch1 Matrix multiply.
+ output1 = torch.matmul(input1, self.weight_c.t())
+ output1 = self.mlp_activation_func(output1)
+ if bias_c is not None:
+ output1 = output1 + bias_c
+ output1 = torch.matmul(output1, self.weight_r.t())
+ deepspeed.comm.all_reduce(output1, group=self.mpu.get_tensor_model_parallel_group())
+
+ handle2.wait()
+
+ output0 = output0 + self.bias_r if self.bias_r is not None else output0
+ output1 = output1 + self.bias_r if self.bias_r is not None else output1
+ output_bias = self.output_bias
+ mlp_output0, mlp_output1, mlp_bias0, mlp_bias1 = output0, output1, output_bias, output_bias
+ # ------------ explicit mlp end ------------
+
+ if self.apply_residual_connection_post_layernorm:
+ residual0 = layernorm_output0
+ residual1 = layernorm_output1
+ else:
+ residual0 = layernorm_input0
+ residual1 = layernorm_input1
+
+ if mlp_bias0 is not None:
+ mlp_bias0 = mlp_bias0.expand_as(residual0)
+ mlp_bias1 = mlp_bias1.expand_as(residual1)
+ output0 = bias_dropout_add_func(mlp_output0, mlp_bias0, residual0, self.hidden_dropout)
+ output1 = bias_dropout_add_func(mlp_output1, mlp_bias1, residual1, self.hidden_dropout)
+
+ return output0, output1
+
+
+class DominoTransformer(DominoModule):
+ """Transformer class."""
+
+ def __init__(self,
+ config,
+ model_type,
+ mpu,
+ fused_layer_norm,
+ _initialize_affine_weight_gpu,
+ ColumnParallelLinear,
+ RowParallelLinearNoComm,
+ apply_rotary_pos_emb,
+ bias_dropout_add_fused_train,
+ bias_dropout_add_fused_inference,
+ layer_type=LayerType.encoder,
+ self_attn_mask_type=AttnMaskType.causal,
+ pre_process=True,
+ post_process=True,
+ post_layer_norm=True,
+ drop_path_rate=0.0):
+ super(DominoTransformer, self).__init__()
+
+ self.layer_type = layer_type
+ self.model_type = model_type
+ self.post_process = post_process
+ self.post_layer_norm = post_layer_norm
+ self.num_layers = config.num_layers
+ self.drop_path_rate = drop_path_rate
+ self.drop_path_rates = [rate.item() for rate in torch.linspace(0, self.drop_path_rate, config.num_layers)]
+
+ def build_layer(layer_number):
+ return DominoTransformerLayer(config,
+ layer_number,
+ mpu,
+ fused_layer_norm,
+ _initialize_affine_weight_gpu,
+ ColumnParallelLinear,
+ RowParallelLinearNoComm,
+ apply_rotary_pos_emb,
+ bias_dropout_add_fused_train,
+ bias_dropout_add_fused_inference,
+ layer_type=layer_type,
+ self_attn_mask_type=self_attn_mask_type,
+ drop_path_rate=self.drop_path_rates[layer_number - 1])
+
+ self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
+
+ if self.post_process and self.post_layer_norm:
+ self.final_layernorm = fused_layer_norm(config.hidden_size,
+ eps=config.layernorm_epsilon,
+ no_persist_layer_norm=config.no_persist_layer_norm)
+
+ def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
+ # hidden_states: [s, b, h]
+
+ for index in range(self.num_layers):
+ layer = self.layers[index]
+ hidden_states = layer(hidden_states, attention_mask, rotary_pos_emb)
+
+ hidden_states0, hidden_states1 = hidden_states
+ if self.post_process and self.post_layer_norm:
+ hidden_states0 = self.final_layernorm(hidden_states0)
+ hidden_states1 = self.final_layernorm(hidden_states1)
+
+ return hidden_states0, hidden_states1
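
The layer above overlaps tensor-parallel communication with computation: it launches `deepspeed.comm.all_reduce(..., async_op=True)`, stores the returned handle in `handle_dic` keyed by layer and micro-batch, and relies on the identity autograd ops (`NoOper`, `_CopyToModelParallelRegionA`) to call `handle.wait()` only when the gradient is actually needed. The toy sketch below reproduces just that handle-and-wait mechanism; `FakeHandle` and `fake_all_reduce` are stand-ins so it runs without a process group, and it is not the Domino layer itself.

```python
# Stand-alone illustration of the async-handle pattern used by Domino.
import torch

handle_dic = {}


class FakeHandle:

    def wait(self):
        pass  # a real communication handle blocks until the collective finishes


def fake_all_reduce(tensor, async_op=True):
    return FakeHandle()


class WaitOnHandle(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, key):
        ctx.key = key
        return x

    @staticmethod
    def backward(ctx, grad):
        handle_dic[ctx.key].wait()  # defer the wait until backward needs the result
        return grad, None


x = torch.randn(4, requires_grad=True)
out = x * 2  # micro-batch 0 compute
handle_dic["layer1_0"] = fake_all_reduce(out.detach())  # kick off "communication"
out = WaitOnHandle.apply(out, "layer1_0")
out.sum().backward()  # the stashed handle is waited on here
```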
diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py
index 36300eb904ddc..a82d8b1d5c7a0 100755
--- a/deepspeed/runtime/eigenvalue.py
+++ b/deepspeed/runtime/eigenvalue.py
@@ -114,8 +114,8 @@ def compute_eigenvalue(self, module, device=None, scale=1.0):
eigenvalue_current, eigenvalue_previous = 1., 0.
while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs(
- (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >=
- self.tol): # test convergence criteria
+ (eigenvalue_current - eigenvalue_previous) / eigenvalue_current)
+ >= self.tol): # test convergence criteria
eigenvalue_previous = eigenvalue_current
Hv = torch.autograd.grad(grads, params, grad_outputs=v, only_inputs=True, retain_graph=True)
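
The reformatted condition expresses the power-iteration stopping rule |(λ_cur − λ_prev) / λ_cur| ≥ tol. A toy power iteration on a small symmetric matrix with the same relative-change test is sketched below; the module-level Hessian-vector version in this file is untouched and the matrix and tolerance here are arbitrary.

```python
# Toy power iteration using the same relative-change stopping rule.
import torch

A = torch.tensor([[2.0, 1.0], [1.0, 3.0]])
v = torch.randn(2)
v = v / v.norm()

eig_cur, eig_prev, tol, max_iter, i = 1.0, 0.0, 1e-6, 100, 0
while i < max_iter and abs(eig_cur) > 0 and abs((eig_cur - eig_prev) / eig_cur) >= tol:
    eig_prev = eig_cur
    Av = A @ v
    eig_cur = torch.dot(v, Av).item()  # Rayleigh quotient estimate
    v = Av / Av.norm()
    i += 1

print(f"dominant eigenvalue ~= {eig_cur:.4f} after {i} iterations")
```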
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 05bb23e8ddd94..8c5da36e5a787 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -17,6 +17,7 @@
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from contextlib import contextmanager
from typing import Callable, Dict, Union, Iterable, Container
@@ -216,6 +217,7 @@ def __init__(self,
self.loaded_checkpoint_mp_world_size = None
self.loaded_checkpoint_dp_world_size = None
self.enable_backward_allreduce = True
+ self.inside_no_sync_ctxt = False
self.progressive_layer_drop = None
self.eigenvalue = None
self.block_eigenvalue = None
@@ -811,6 +813,9 @@ def zero_max_reuse_distance(self):
def zero_prefetch_bucket_size(self):
return self._config.zero_config.prefetch_bucket_size
+ def zero_module_granularity_threshold(self):
+ return self._config.zero_config.module_granularity_threshold
+
def zero_param_persistence_threshold(self):
return self._config.zero_config.param_persistence_threshold
@@ -1077,7 +1082,10 @@ def _supported_optims(self):
# Validate configuration based on command line arguments
def _do_sanity_check(self):
if self.fp16_enabled() and not get_accelerator().is_fp16_supported():
- raise ValueError("Type fp16 is not supported.")
+ raise ValueError("Type fp16 is not supported on your device.")
+
+ if self.bfloat16_enabled() and not get_accelerator().is_bf16_supported():
+ raise ValueError("Type bf16 is not supported on your device.")
expected_optim_types = self._supported_optims()
expected_optim_types += [type(None), Callable]
@@ -1611,6 +1619,7 @@ def _configure_zero_optimizer(self, optimizer):
zero_param_parallel_group=zero_param_parallel_group,
zero_quantized_weights=self.zero_quantized_weights(),
zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(),
+ zero_module_granularity_threshold=self.zero_module_granularity_threshold(),
)
else:
log_dist(
@@ -1657,6 +1666,7 @@ def _configure_zero_optimizer(self, optimizer):
zero_hpz_partition_size=self.zero_hpz_partition_size(),
zero_quantized_weights=self.zero_quantized_weights(),
zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(),
+ zero_module_granularity_threshold=self.zero_module_granularity_threshold(),
)
else:
@@ -1973,12 +1983,31 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
grads = None
self.buffered_allreduce_fallback(grads=grads, elements_per_buffer=bucket_size)
+ @contextmanager
+ def no_sync(self):
+ r"""
+ Context manager to disable gradient reduction during backward pass.
+        This context manager has the following effects on other DeepSpeed features:
+        1. Incompatible with ZeRO stage 2/3, which rely on reduction for gradient partitioning.
+ 2. It is illegal to call engine.step() within the context manager.
+ 3. Tracking of gradient accumulation steps is disabled.
+ """
+ assert not self.zero_optimization_partition_gradients(), \
+ f"no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage {self.zero_optimization_stage()}"
+
+        assert not self.inside_no_sync_ctxt, "no_sync context manager reentry is unsupported"
+
+ self.inside_no_sync_ctxt = True
+ try:
+ yield
+ finally:
+ self.inside_no_sync_ctxt = False
+
@instrument_w_nvtx
- def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_graph=False, scale_wrt_gas=True):
+ def backward(self, loss, release_loss=False, retain_graph=False, scale_wrt_gas=True):
r"""Execute backward pass on the loss
Arguments:
loss: Torch tensor on which to execute backward propagation
- allreduce_gradients: is deprecated, ignored, and will soon be removed'
retain_graph: bool, default: false
forward on user defined choice of retain_graph
"""
@@ -1988,11 +2017,10 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr
if self.scale_wrt_gas is not None:
scale_wrt_gas = self.scale_wrt_gas
- if not allreduce_gradients:
- logger.warning(f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed")
+ do_gradient_reduction = self.enable_backward_allreduce and not self.inside_no_sync_ctxt
- # scale loss w.r.t. gradient accumulation if needed
- if self.gradient_accumulation_steps() > 1 and scale_wrt_gas:
+ # scale loss w.r.t. gradient accumulation if reduction is not disabled
+ if do_gradient_reduction and self.gradient_accumulation_steps() > 1 and scale_wrt_gas:
loss = self._scale_loss_by_gas(loss.float())
# Log training loss
@@ -2041,7 +2069,7 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr
self._start_timers(self.engine_timers.backward_reduce_timers)
- if allreduce_gradients and self.enable_backward_allreduce:
+ if do_gradient_reduction:
# Traditional code path that allreduces the module parameter grads
self.allreduce_gradients()
@@ -2177,6 +2205,9 @@ def step(self, lr_kwargs=None):
r"""Execute the weight update step after forward and backward propagation
on effective_train_batch.
"""
+ assert not self.inside_no_sync_ctxt, \
+            "It is illegal to call Engine.step() inside the no_sync context manager"
+
see_memory_usage("Engine before step", force=self.memory_breakdown())
# Check early because self.global_steps is incremented at some point here.
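
For readers of the `no_sync()` docstring above, here is a sketch of how the context manager is meant to be used for manual gradient accumulation. `engine`, `batches`, and `loss_fn` are placeholders for a `deepspeed.initialize()` engine and user data; the engine must not use ZeRO stage 2/3 gradient partitioning, and `step()` stays outside the context.

```python
# Usage sketch only; engine, batches and loss_fn are assumed to exist.
accumulation_steps = 4

for step, batch in enumerate(batches):
    if (step + 1) % accumulation_steps != 0:
        with engine.no_sync():  # skip gradient reduction on this micro-step
            loss = loss_fn(engine(batch))
            engine.backward(loss)  # no allreduce, no GAS-based loss scaling
    else:
        loss = loss_fn(engine(batch))
        engine.backward(loss)  # gradients are reduced on the boundary step
        engine.step()  # calling step() inside no_sync() would trip the assert
```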
diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 7ebf5487cf9ed..deb44c2e71eb9 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -287,7 +287,8 @@ def _exec_reduce_tied_grads(self):
weight_group_list = self.module.get_tied_weights_and_groups()
for weight, group in weight_group_list:
grad = weight._hp_grad if self.using_bf16_optimizer else weight.grad
- dist.all_reduce(grad, group=group)
+ if grad is not None:
+ dist.all_reduce(grad, group=group)
def _exec_reduce_grads(self):
self._force_grad_boundary = True
@@ -639,9 +640,10 @@ def _aggregate_total_loss(self):
self.dp_group_loss = losses[0].clone().detach()
agg_loss = losses[1].clone().detach()
if additional_losses is not None:
- self.agg_additional_losses = OrderedDict(
- {name: losses[2 + i].clone().detach()
- for i, name in enumerate(additional_losses.keys())})
+ self.agg_additional_losses = OrderedDict({
+ name: losses[2 + i].clone().detach()
+ for i, name in enumerate(additional_losses.keys())
+ })
return agg_loss
def set_dataloader(self, loader):
diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py
index 90b2d9b8bd31e..1f9825c34638b 100644
--- a/deepspeed/runtime/swap_tensor/utils.py
+++ b/deepspeed/runtime/swap_tensor/utils.py
@@ -18,12 +18,12 @@
def swap_in_tensors(swap_handle, tensor_buffers, swap_paths):
for buffer, path in zip(tensor_buffers, swap_paths):
- assert (swap_handle.async_pread(buffer, path) == 0)
+ assert (swap_handle.async_pread(buffer, path, 0) == 0)
def swap_out_tensors(swap_handle, tensor_buffers, swap_paths):
for buffer, path in zip(tensor_buffers, swap_paths):
- assert (swap_handle.async_pwrite(buffer, path) == 0)
+ assert (swap_handle.async_pwrite(buffer, path, 0) == 0)
def print_object(obj, name, exclude_list=[]):
diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index b9617d3e632fd..f48adb58c9bff 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -257,8 +257,8 @@ def has_overflow(self, params, has_moe_params=None):
elif self.mpu is not None:
if self.deepspeed is not None:
using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce')
- if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce is False) or (
- not using_pipeline and self.deepspeed.enable_backward_allreduce is False):
+ if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce
+ is False) or (not using_pipeline and self.deepspeed.enable_backward_allreduce is False):
dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group())
dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group())
elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False:
diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py
index 1cfcd784e2ceb..19b272ce9e92b 100644
--- a/deepspeed/runtime/zero/config.py
+++ b/deepspeed/runtime/zero/config.py
@@ -21,6 +21,7 @@
"stage3_max_live_parameters" : 1000000000,
"stage3_max_reuse_distance" : 1000000000,
"stage3_use_all_reduce_for_fetch_params": [true|false],
+ "stage3_module_granularity_threshold": 0,
"allgather_partitions": [true|false],
"use_multi_rank_bucket_allreduce": [true|false],
"allgather_bucket_size": 500000000,
@@ -245,6 +246,14 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
this option is enabled and then saves the fp16 model weights.
"""
+ module_granularity_threshold: int = Field(pp_int(0), alias="stage3_module_granularity_threshold")
+ """
+ The granularity of a module is determined by the ratio of "parameter_count / (1 + descendant count)".
+    ZeRO3 classifies modules whose granularity falls below the threshold as fine-grained
+    and treats them as integral units during parameter fetching. This reduces host overhead
+    and the separate allgather overhead that per-module hooks introduce for fine-grained layers.
+ """
+
use_all_reduce_for_fetch_params: bool = Field(False, alias="stage3_use_all_reduce_for_fetch_params")
"""
Use all_reduce op when fetching module parameters at stage3. This improves performance by reducing
@@ -302,7 +311,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
for efficient all_2_all_reduce comm
"""
- mics_shard_size: int = Field(-1, new_param="mics_shard_size")
+ mics_shard_size: int = Field(-1, json_schema_extra={"new_param": "mics_shard_size"})
mics_hierarchical_params_gather: bool = False
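
A sketch of a ZeRO-3 config dict that turns the new knob on; the threshold value 30000 is an arbitrary illustration, not a recommendation, and should be compared against the per-module granularity table DeepSpeed prints at startup (see the `parameter_offload.py` changes below).

```python
# Illustrative config dict; pass it to deepspeed.initialize(..., config=ds_config).
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "bf16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 3,
        "stage3_module_granularity_threshold": 30000,  # example value only
    },
}
```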
diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py
index 1ce2414a1e17c..f945f51661900 100644
--- a/deepspeed/runtime/zero/parameter_offload.py
+++ b/deepspeed/runtime/zero/parameter_offload.py
@@ -6,7 +6,7 @@
import sys
import torch
from collections import OrderedDict
-from deepspeed.utils import z3_leaf_module
+from deepspeed.utils import z3_leaf_module, set_z3_leaf_module
from deepspeed.runtime.utils import see_memory_usage
from deepspeed.runtime.zero.utils import apply_to_tensors_only, is_zero_param
from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
@@ -14,6 +14,7 @@
from deepspeed.runtime.zero.partition_parameters import *
from deepspeed.runtime.zero.partitioned_param_coordinator import PartitionedParameterCoordinator, InflightParamRegistry, iter_params
from deepspeed.accelerator import get_accelerator
+from deepspeed import utils
FWD_MODULE_STACK = list()
@@ -38,7 +39,7 @@ def _apply_forward_and_backward_to_tensors_only(module, forward_function, backwa
class ZeROOrderedDict(OrderedDict):
- def __init__(self, parent_module=None, *args, **kwargs):
+ def __init__(self, parent_module, *args, **kwargs):
"""A replacement for ``collections.OrderedDict`` to detect external ZeRO params.
Args:
@@ -49,6 +50,10 @@ def __init__(self, parent_module=None, *args, **kwargs):
self._parent_module = parent_module
self._in_forward = False
+ def __reduce__(self):
+ r0, _, *r2 = super().__reduce__()
+ return (r0, (self._parent_module, )) + r2
+
def __getitem__(self, key):
param = super().__getitem__(key)
@@ -56,6 +61,7 @@ def __getitem__(self, key):
if param is None:
return param
+ # TODO: only weaken this check during compilation
if hasattr(param, "ds_status") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
if self._parent_module._parameters._in_forward:
register_external_parameter(FWD_MODULE_STACK[-1], param)
@@ -96,6 +102,7 @@ def __init__(
zero_param_parallel_group=None,
zero_quantized_weights=False,
zero_quantized_nontrainable_weights=False,
+ zero_module_granularity_threshold=0,
):
see_memory_usage("DeepSpeedZeRoOffload initialize [begin]", force=True)
@@ -128,7 +135,6 @@ def __init__(
self.persistent_parameters = self.mark_persistent_parameters(self.param_numel_persistence_threshold,
self.model_persistence_threshold)
- self.param_coordinators = {}
self._prefetch_bucket_sz = int(prefetch_bucket_size)
self._max_reuse_distance_in_numel = int(max_reuse_distance)
self._max_available_parameters_in_numel = int(max_live_parameters)
@@ -136,14 +142,31 @@ def __init__(
) if overlap_comm else get_accelerator().default_stream()
if not hasattr(module, "ds_inflight_param_registry"):
- module.ds_inflight_param_registry = dict()
- # we need two registries, one for training and one for eval. They will be used when creating PartitionedParameterCoordinator
- module.ds_inflight_param_registry[True] = InflightParamRegistry()
- module.ds_inflight_param_registry[False] = InflightParamRegistry()
+ module.ds_inflight_param_registry = InflightParamRegistry()
self.__inflight_param_registry = module.ds_inflight_param_registry
+ self.param_coordinator = PartitionedParameterCoordinator(
+ prefetch_bucket_sz=self._prefetch_bucket_sz,
+ max_reuse_distance_in_numel=self._max_reuse_distance_in_numel,
+ max_available_parameters_in_numel=self._max_available_parameters_in_numel,
+ allgather_stream=self.__allgather_stream,
+ inflight_param_registry=self.__inflight_param_registry,
+ prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme,
+ timers=self.timers,
+ zero_quantized_weights=self.zero_quantized_weights,
+ zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights,
+ )
+
+ if zero_module_granularity_threshold > 0:
+ self.min_granularity_value = sys.maxsize
+ self.min_granularity_layer = None
+ self.granularity_info = set()
+ self.z3_leaf_layers = []
+ self._set_z3_leaf_modules_by_threshold(module, zero_module_granularity_threshold)
+
self.forward_hooks = []
self.backward_hooks = []
+
self.setup_zero_stage3_hooks()
print_rank_0(
f'Created module hooks: forward = {len(self.forward_hooks)}, backward = {len(self.backward_hooks)}',
@@ -156,26 +179,13 @@ def partition_all_parameters(self):
"""Partitioning Parameters that were not partitioned usually if parameters
of modules whose input parameters do not require grad computation do not
trigger post call and will therefore will remain unpartitioned"""
- self.get_param_coordinator(training=self.module.training).release_and_reset_all(self.module)
+ self.get_param_coordinator().release_and_reset_all(self.module)
for param in iter_params(self.module, recurse=True):
if param.ds_status != ZeroParamStatus.NOT_AVAILABLE:
raise RuntimeError(f"{param.ds_summary()} expected to be released")
- def get_param_coordinator(self, training):
- if not training in self.param_coordinators:
- self.param_coordinators[training] = PartitionedParameterCoordinator(
- prefetch_bucket_sz=self._prefetch_bucket_sz,
- max_reuse_distance_in_numel=self._max_reuse_distance_in_numel,
- max_available_parameters_in_numel=self._max_available_parameters_in_numel,
- allgather_stream=self.__allgather_stream,
- inflight_param_registry=self.__inflight_param_registry[training],
- prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme,
- timers=self.timers,
- zero_quantized_weights=self.zero_quantized_weights,
- zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights,
- )
-
- return self.param_coordinators[training]
+ def get_param_coordinator(self):
+ return self.param_coordinator
def empty_partition_cache(self):
self.partition_all_parameters()
@@ -223,14 +233,14 @@ def setup_zero_stage3_hooks(self):
#reset step if in inference mode
@instrument_w_nvtx
- def _end_of_forward_hook(module, *args):
+ def _start_of_forward_hook(module, *args):
+
+ self.get_param_coordinator().reset_step()
- if not torch._C.is_grad_enabled():
- self.get_param_coordinator(training=False).reset_step()
+ self.module.register_forward_pre_hook(_start_of_forward_hook)
#likely one of them should be enough but just to be safe
self._register_hooks_recursively(self.module)
- self.module.register_forward_hook(_end_of_forward_hook)
# Add top module to stack trace
global FWD_MODULE_STACK
@@ -382,7 +392,8 @@ def _run_before_forward_function(input):
_run_after_backward_hook, inputs)
def _post_backward_module_hook(module, inputs):
- module.ds_grads_remaining = 0
+ if not hasattr(module, "ds_grads_remaining"):
+ module.ds_grads_remaining = 0
if not hasattr(module, "post_bwd_fn"):
@@ -442,7 +453,7 @@ def pre_sub_module_forward_function(self, sub_module):
global FWD_MODULE_STACK
FWD_MODULE_STACK.append(sub_module)
- param_coordinator = self.get_param_coordinator(training=sub_module.training)
+ param_coordinator = self.get_param_coordinator()
param_coordinator.trace_prologue(sub_module)
if param_coordinator.is_record_trace():
param_coordinator.record_module(sub_module)
@@ -455,7 +466,7 @@ def post_sub_module_forward_function(self, sub_module):
see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release",
force=False)
- param_coordinator = self.get_param_coordinator(training=sub_module.training)
+ param_coordinator = self.get_param_coordinator()
param_coordinator.release_sub_module(sub_module)
see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release",
@@ -463,8 +474,8 @@ def post_sub_module_forward_function(self, sub_module):
@torch.no_grad()
def pre_sub_module_backward_function(self, sub_module):
- assert sub_module.training, "backward pass is invalid for module in evaluation mode"
- param_coordinator = self.get_param_coordinator(training=True)
+ # assert sub_module.training, "backward pass is invalid for module in evaluation mode"
+ param_coordinator = self.get_param_coordinator()
param_coordinator.trace_prologue(sub_module)
if param_coordinator.is_record_trace():
param_coordinator.record_module(sub_module)
@@ -472,13 +483,92 @@ def pre_sub_module_backward_function(self, sub_module):
@torch.no_grad()
def post_sub_module_backward_function(self, sub_module):
- assert sub_module.training, "backward pass is invalid for module in evaluation mode"
+ # assert sub_module.training, "backward pass is invalid for module in evaluation mode"
see_memory_usage(
f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release",
force=False)
- self.get_param_coordinator(training=True).release_sub_module(sub_module)
+ self.get_param_coordinator().release_sub_module(sub_module)
see_memory_usage(
f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release",
force=False)
+
+ def _set_z3_leaf_modules_by_threshold(self, module, zero_module_granularity_threshold):
+
+ self._get_granularity_recursively(module)
+ print_rank_0(f"{'MODULE NAME'.ljust(30)}|{'GRANULARITY VALUE'.rjust(20)}", force=True)
+ for granularity in self.granularity_info:
+ print_rank_0(granularity, force=True)
+
+ if self.min_granularity_value <= zero_module_granularity_threshold:
+ self._set_leaf_by_threshold_preorder(module, zero_module_granularity_threshold)
+ utils.logger.info(
+ f"z3_leaf_module was set by stage3_module_granularity_threshold:{zero_module_granularity_threshold}")
+ for layer in self.z3_leaf_layers:
+ print_rank_0(f"{layer.__class__.__name__}:{layer.ds_model_granularity}", force=True)
+ else:
+ utils.logger.warning(
+ f"The smallest module granularity is [{self.min_granularity_layer}:{self.min_granularity_value}]. "\
+ f"To make stage3_module_granularity_threshold effective, you need to set stage3_module_granularity_threshold >= {self.min_granularity_value}. "\
+ f"Current Value:{zero_module_granularity_threshold}"
+ )
+
+ def _get_granularity_recursively(self, module):
+        """Recursively compute the granularity of each module."""
+
+        # avoid marking particularly large modules as leaves, even if their granularity is very small;
+        # an oversized leaf module increases the number of live parameters, introducing memory overhead
+ Z3_MAX_LEAF_SIZE = 1e9
+
+ if not list(module.parameters()):
+ # skip Modules without parameters, such as GELU, etc.
+ module.ds_model_granularity = sys.maxsize
+ return 0, 0
+
+ num_layers = 0
+ num_params = 0
+ num_params += sum(p.ds_numel for p in module.parameters(recurse=False))
+ if not any(module.children()):
+ # torch leaf module
+ module.ds_model_granularity = sys.maxsize
+ return 1, num_params
+
+ for child in module.children():
+ layers_in_child, params_in_child = self._get_granularity_recursively(child)
+ num_layers += layers_in_child
+ num_params += params_in_child
+
+ if module.__class__.__name__ in torch.nn.modules.container.__all__:
+ # Do not set container modules like ModuleList as leaf modules
+ # as this will prevent hooks from being set on their children
+            # and they may not invoke the forward method
+ module.ds_model_granularity = sys.maxsize
+ return num_layers, num_params
+
+ num_layers += 1
+ ds_model_granularity = (num_params // num_layers) if num_params <= Z3_MAX_LEAF_SIZE else sys.maxsize
+ module.ds_model_granularity = ds_model_granularity
+ # module.ds_model_num_layers = num_layers
+ # module.ds_model_num_params = num_params
+ if self.min_granularity_value > ds_model_granularity:
+ self.min_granularity_value = ds_model_granularity
+ self.min_granularity_layer = module.__class__.__name__
+ self.granularity_info.add(f"{module.__class__.__name__.ljust(30)}|{str(ds_model_granularity).rjust(20)}")
+
+ return num_layers, num_params
+
+    def _set_leaf_by_threshold_preorder(self, module, granularity_threshold):
+        """Set modules as leaf modules based on the threshold, prioritizing parent nodes."""
+
+ num_params = sum(p.ds_numel for p in module.parameters())
+ if num_params == 0:
+ # skip Modules without parameters, such as GELU, etc.
+ return
+        if module.ds_model_granularity <= granularity_threshold:
+ set_z3_leaf_module(module, True)
+ self.z3_leaf_layers.append(module)
+ return
+
+ for sub_module in module.children():
+            self._set_leaf_by_threshold_preorder(sub_module, granularity_threshold)
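
A simplified, standalone version of the granularity heuristic described above, roughly `parameter_count / (1 + descendant module count)`; the real implementation works on `ds_numel` of ZeRO-partitioned parameters, skips container modules, and caps oversized modules, none of which this toy does.

```python
# Toy granularity estimate for plain torch modules; illustration only.
import sys

import torch


def granularity(module: torch.nn.Module) -> int:
    params = sum(p.numel() for p in module.parameters())
    if params == 0:
        return sys.maxsize  # parameter-free modules are never leaf candidates
    descendants = sum(1 for _ in module.modules()) - 1
    return params // (1 + descendants)


block = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU(), torch.nn.Linear(64, 64))
print(granularity(block))  # small ratio: many tiny submodules per parameter
print(granularity(torch.nn.Sequential(torch.nn.Linear(4096, 4096))))  # large ratio
```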
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 22a6746bb57cf..cb0cd7c8017db 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -1882,6 +1882,7 @@ def _allgather_params_coalesced(self, param_list, hierarchy=0, quantize=False):
return None
+ @torch.no_grad()
def _allgather_params(self, param_list, hierarchy=0):
if len(param_list) == 0:
return
diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py
index 5780b2afd6def..596d0e9c20f93 100644
--- a/deepspeed/runtime/zero/partitioned_param_coordinator.py
+++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py
@@ -18,6 +18,7 @@
from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id
from deepspeed.accelerator import get_accelerator
import deepspeed.runtime.compiler as compiler
+from deepspeed.runtime.compiler import is_compiling
import logging
@@ -92,7 +93,7 @@ def __init__(
# keeps track of the number of submodules invoked so far.
self.__step_id: int = 0
# network tracing mode
- self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.RECORD
+ self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.INVALID
# sequence of submodules/parameters in forward pass + backward pass
self.__submodule_order: Iterable[Module] = []
self.__param_order: Iterable[__class__.__ParamInTrace] = []
@@ -188,6 +189,9 @@ def trace_prologue(self, sub_module: Module) -> None:
@compiler.disable
def record_module(self, sub_module: Module) -> None:
"""adds sub module to trace"""
+ if is_compiling():
+ return
+
if not self.is_record_trace():
raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}")
@@ -195,6 +199,8 @@ def record_module(self, sub_module: Module) -> None:
self.__step_id_module_fetched_for[sub_module.id].append(self.__step_id)
def record_parameters(self, sub_module: Module) -> None:
        """adds sub module to trace"""
+        if is_compiling():
+            return
if not self.is_record_trace():
raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}")
@@ -209,8 +215,12 @@ def construct_parameter_trace_from_module_trace(self):
for sub_module in self.__submodule_order:
self.record_parameters(sub_module)
+ @compiler.disable
def reset_step(self) -> None:
"""indicate that we have completed one fwd+bwd for the model"""
+ if is_compiling():
+ return
+
self._clean_inflight_param_registry()
if not self.is_complete_trace(): # not self.trace_complete:
@@ -242,7 +252,6 @@ def reset_step(self) -> None:
self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10))
self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque())
self.__step_id = 0
- self.__n_available_params = 0
self.__profiler.reset_events()
def _dump_params(self, tag, sub_module, params, step_id=None):
@@ -420,7 +429,7 @@ def release_and_reset_all(self, module: Module) -> None:
# there's a hook execution issue
param.ds_active_sub_modules.clear()
self.__release_param(param)
-
+ self.__n_available_params = 0
for param in iter_params(module, recurse=True):
if param.ds_status != ZeroParamStatus.NOT_AVAILABLE:
raise RuntimeError(f"{param.ds_summary()} expected to be released")
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index e2c273fd913fb..99a5ecf41a2fc 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -157,6 +157,7 @@ def __init__(
zero_hpz_partition_size=1,
zero_quantized_weights=False,
zero_quantized_nontrainable_weights=False,
+ zero_module_granularity_threshold=0,
):
see_memory_usage("Stage 3 initialize beginning", force=True)
@@ -227,7 +228,8 @@ def __init__(
mpu=mpu,
zero_param_parallel_group=zero_param_parallel_group,
zero_quantized_weights=zero_quantized_weights,
- zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights)
+ zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights,
+ zero_module_granularity_threshold=zero_module_granularity_threshold)
self.persistent_parameters = self.parameter_offload.persistent_parameters
self._configure_offloading(offload_optimizer_config, offload_param_config)
@@ -458,6 +460,7 @@ def initialize_ds_offload(
zero_param_parallel_group,
zero_quantized_weights,
zero_quantized_nontrainable_weights,
+ zero_module_granularity_threshold,
):
return DeepSpeedZeRoOffload(module=module,
timers=timers,
@@ -473,7 +476,8 @@ def initialize_ds_offload(
mpu=mpu,
zero_param_parallel_group=zero_param_parallel_group,
zero_quantized_weights=zero_quantized_weights,
- zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights)
+ zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights,
+ zero_module_granularity_threshold=zero_module_granularity_threshold)
def _get_trainable_parameter_groups(self):
param_groups = []
@@ -538,10 +542,15 @@ def _setup_for_real_optimizer(self):
self.grad_partitions_flat_buffer = get_accelerator().pin_memory(self.grad_partitions_flat_buffer)
offset = 0
+ max_partition_numel = 0
for param in all_params:
self.__param_id_to_grad_partition[param.ds_id] = self.grad_partitions_flat_buffer.narrow(
0, offset, param.partition_numel())
offset += param.partition_numel()
+ max_partition_numel = max(max_partition_numel, param.partition_numel())
+ if self.offload_optimizer:
+ self.pinned_grad_buffer: Tensor = get_accelerator().pin_memory(
+ torch.empty(max_partition_numel, device=self.device))
def _link_all_hp_params(self):
for p in self.module.parameters():
@@ -588,8 +597,8 @@ def defragment(tensors: List[Tensor]) -> Tensor:
return device_buffer
- def _get_param_coordinator(self, training):
- return self.parameter_offload.get_param_coordinator(training)
+ def _get_param_coordinator(self):
+ return self.parameter_offload.get_param_coordinator()
def _configure_offloading(self, offload_optimizer_config, offload_param_config):
###################### offload optimizer setup ##################################
@@ -1498,9 +1507,13 @@ def partition_grads(self, params_to_release: List[Parameter], grad_partitions: L
offload_fp32_gradients[i].append(grad_buffer.float())
offload_fp32_offsets[i].append(dest_offset)
else:
+ buffer_numel = grad_buffer.numel()
fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow(
- 0, dest_offset, grad_buffer.numel())
- fp32_grad_tensor.copy_(grad_buffer)
+ 0, dest_offset, buffer_numel)
+ self.pinned_grad_buffer[:buffer_numel].copy_(
+ grad_buffer.to(dtype=torch.float32, non_blocking=True))
+ get_accelerator().synchronize()
+ fp32_grad_tensor.copy_(self.pinned_grad_buffer[:buffer_numel], non_blocking=True)
# free the gradient
if not get_accelerator().is_synchronized_device():
@@ -1865,7 +1878,7 @@ def _pre_step(self):
see_memory_usage(f"In step before checking overflow", force=False)
print_rank_0("Finished Tracing at Beginning of Step")
- self._get_param_coordinator(training=True).hierarchy = 0
+ self._get_param_coordinator().hierarchy = 0
print_rank_0("Finished Tracing at Beginning of Step")
@@ -2249,8 +2262,6 @@ def backward(self, loss, retain_graph=False):
else:
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
- self._get_param_coordinator(training=True).reset_step()
-
if self.swap_optimizer:
self.optimizer_swapper.post_backward()
@@ -2730,7 +2741,7 @@ def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, pa
assert os.path.isfile(
optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.'
- optim_sd = torch.load(optim_state_path)
+ optim_sd = torch.load(optim_state_path, weights_only=False)
self._load_global_state_stage3(optim_sd)
key_list = ["fp32", "exp_avg", "exp_avg_sq"]
@@ -2788,7 +2799,7 @@ def load_hp_checkpoint_state(self, folder, key):
local_rank = dist.get_local_rank()
# Load tensors from files and reshape them to flat vectors
- loaded_checkpoint_state = torch.load(os.path.join(folder, f"{key}.pt")).view(-1)
+ loaded_checkpoint_state = torch.load(os.path.join(folder, f"{key}.pt"), weights_only=False).view(-1)
# Partition the loaded data according to the local rank
world_size = dist.get_world_size(group=self.dp_process_group)
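
The `partition_grads` change above routes offloaded gradients through a reusable pinned staging buffer sized to the largest partition instead of copying device tensors straight into pageable CPU memory. Below is a standalone sketch of that device → pinned → CPU-fp32 staged copy; sizes are arbitrary and the snippet is a no-op on machines without CUDA.

```python
# Illustration of the staged copy pattern; buffer sizes are made up.
import torch

if torch.cuda.is_available():
    device_grad = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
    pinned = torch.empty(4096, dtype=torch.float32, pin_memory=True)  # max partition numel
    cpu_fp32_partition = torch.zeros(1024, dtype=torch.float32)

    n = device_grad.numel()
    pinned[:n].copy_(device_grad.to(dtype=torch.float32, non_blocking=True))
    torch.cuda.synchronize()  # ensure the device-to-host copy has landed
    cpu_fp32_partition.copy_(pinned[:n], non_blocking=True)
```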
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index df7a2f83e3bcc..7ac89a2338087 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1070,14 +1070,10 @@ def average_tensor(self, tensor):
for i, param, param_id in self.params_in_ipg_bucket:
process_group = self.dp_process_group
- grad_reduc = self.get_gradient_for_reduction(param)
- #Averages gradients at parameter level if ipg has a moe param
- #Otherwise averaging is done at the entire buffer level at the end of the loop
- # MoE param have different groups
+
if self.ipg_bucket_has_moe_params:
process_group = self.expert_dp_process_group[param.group_name] if is_moe_param(
param) else self.dp_process_group
- grad_reduc.data.div_(dist.get_world_size(group=process_group) / float(self.sequence_parallel_size))
partition_ids = self.param_to_partition_ids[i][param_id]
assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids
@@ -1116,8 +1112,7 @@ def average_tensor(self, tensor):
curr_size += numel
prev_id, prev_process_group = partition_id, process_group
- if not self.ipg_bucket_has_moe_params:
- tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size))
+ tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size))
buckets = {}
for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets):
@@ -2302,11 +2297,6 @@ def load_state_dict(self,
def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights):
self.load_hp_checkpoint_state_from_checkpoint_dir("bit16_groups", checkpoint_folder)
- @property
- def param_groups(self):
- """Forward the wrapped optimizer's parameters."""
- return self.optimizer.param_groups
-
def _load_global_state(self, sd):
self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler)
self.dynamic_loss_scale = sd.get('dynamic_loss_scale', self.dynamic_loss_scale)
diff --git a/deepspeed/sequence/layer.py b/deepspeed/sequence/layer.py
index e809fe1118b51..8fd3ca9184330 100644
--- a/deepspeed/sequence/layer.py
+++ b/deepspeed/sequence/layer.py
@@ -10,6 +10,8 @@
import deepspeed.comm as dist
from deepspeed.accelerator import get_accelerator
+from deepspeed.module_inject.tp_shard import get_shard_size_list, set_num_kv_heads, get_num_kv_heads
+from deepspeed.utils import groups
def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len, num_head, head_dim):
@@ -38,8 +40,132 @@ def post_func(input):
return post_func
+def uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group):
+ seq_world_size = dist.get_world_size(group)
+ inp_shape = list(input.shape)
+ assert batch_dim_idx in [0, 1], "batch_dim_idx must be either 0 or 1"
+
+ if not (scatter_idx < 2):
+ input_splits = get_shard_size_list(inp_shape[scatter_idx], seq_world_size)
+ input = input.transpose(0, scatter_idx).contiguous()
+ local_heads = input_splits[groups._get_sequence_parallel_rank()]
+ output_splits = [local_heads] * seq_world_size
+
+ output_buffer_shape = [seq_world_size * local_heads] + list(input.shape[1:])
+ output = torch.empty(output_buffer_shape, device=input.device, dtype=input.dtype)
+        dist.all_to_all_single(output, input, output_split_sizes=output_splits,
+                               input_split_sizes=input_splits, group=group)
+ ###[seq_ws*local_heads, ...] to [seq_ws, local_heads, ...]
+ output = output.view(seq_world_size, local_heads, *output.shape[1:])
+ ###[seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...]
+
+ ### batch_dim_idx=0 [seq_ws,local_heads,seq_len,b,...] to [b, seq_ws, seq_len, local_heads ...]
+ ### batch_dim_idx=1 [seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...]
+ if batch_dim_idx == 0:
+ order = [3, 0, 2, 1] + list(range(4, len(output.shape)))
+ output = output.permute(order).contiguous()
+ ###[b, seq_ws*local_seq_len, local_heads,...]
+ output = output.view(output.shape[0], inp_shape[gather_idx] * seq_world_size,
+ *output.shape[3:]).contiguous()
+ elif batch_dim_idx == 1:
+ output = output.transpose(1, 3).contiguous()
+ ###[seq_ws*local_seq_len, b, local_heads,...]
+ output = output.view(inp_shape[gather_idx] * seq_world_size, *output.shape[2:]).contiguous()
+ else:
+        # For compatibility, handle both 4D and 3D tensors by standardizing to 3D.
+ input = input.reshape(input.shape[0], input.shape[1], -1)
+
+ if batch_dim_idx == 0: #b,s,h
+ input = input.permute(1, 2, 0).contiguous() #s,h,b
+ elif batch_dim_idx == 1: #s,b,h
+ input = input.transpose(1, 2).contiguous() #s,h,b
+ seq_len, h, batch_size = input.shape
+ num_local_heads_list = get_shard_size_list(get_num_kv_heads(), seq_world_size)
+ local_heads = num_local_heads_list[groups._get_sequence_parallel_rank()]
+ h_dim = h // local_heads
+ local_seq_len = seq_len // seq_world_size
+
+ input = input.view(seq_len * h, batch_size)
+ local_seq_len_with_heads = int(input.shape[0] / seq_world_size) # dim size of local_seq_len*local_heads*hdim
+ input_splits = [local_seq_len_with_heads] * seq_world_size
+ coeff = local_seq_len_with_heads // local_heads #per head: dim size of local_seq_len*hdim
+
+ #uneven seq_world_size coeff, total_heads/local_heads.
+ heads_scale_coeff = get_num_kv_heads() / local_heads
+
+ output_splits = [num_local_heads * coeff for num_local_heads in num_local_heads_list]
+ output_buff_d1_size = int(heads_scale_coeff * local_seq_len_with_heads)
+ total_h = int(inp_shape[gather_idx] * heads_scale_coeff)
+ output = torch.empty(output_buff_d1_size, input.shape[1], device=input.device, dtype=input.dtype)
+        dist.all_to_all_single(output, input, output_split_sizes=output_splits,
+                               input_split_sizes=input_splits, group=group)
+ ##################
+ #suppose 7 heads divide into 4 ranks [2,2,2,1]
+ #chunk_num_heads_small=floor(7/4)=1
+ #chunk_num_heads_large=ceil(7/4)=2
+ #num_chunk_heads_large=len([2,2,2])=3, all2all_buffer_counts
+ #num_chunk_heads_small=len([1])=1, all2all_buffer_counts
+        #total_num_large_heads=sum([2,2,2])=6
+ #total_num_small_heads=sum([1])=1
+
+ chunk_num_heads_small = get_num_kv_heads() // seq_world_size # even heads compatible
+ chunk_num_heads_large = chunk_num_heads_small + 1
+ num_chunk_heads_large = get_num_kv_heads() % seq_world_size
+ num_chunk_heads_small = seq_world_size - num_chunk_heads_large
+ total_num_large_heads = num_chunk_heads_large * chunk_num_heads_large
+ total_num_small_heads = num_chunk_heads_small * chunk_num_heads_small
+
+ heads_large_combine_size = coeff * total_num_large_heads
+ heads_small_combine_size = coeff * total_num_small_heads
+ heads_large_chunk, heads_small_chunk = output.split([heads_large_combine_size, heads_small_combine_size],
+ dim=0)
+ heads_large_chunk = heads_large_chunk.view(num_chunk_heads_large, local_seq_len, chunk_num_heads_large, h_dim,
+ batch_size)
+ heads_small_chunk = heads_small_chunk.view(num_chunk_heads_small, local_seq_len, chunk_num_heads_small, h_dim,
+ batch_size)
+ if batch_dim_idx == 0:
+ #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[batch,local_seq_len,all2all_buffer_counts*n_heads,dim]
+ order = [4, 1, 0, 2, 3]
+ heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(batch_size, local_seq_len,
+ total_num_large_heads, h_dim)
+ heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(batch_size, local_seq_len,
+ total_num_small_heads, h_dim)
+ elif batch_dim_idx == 1:
+ #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[local_seq_len,batch,all2all_buffer_counts*n_heads,dim]
+ order = [1, 4, 0, 2, 3]
+ heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(local_seq_len, batch_size,
+ total_num_large_heads, h_dim)
+ heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(local_seq_len, batch_size,
+ total_num_small_heads, h_dim)
+
+ output = torch.cat([heads_large_chunk, heads_small_chunk], dim=2).contiguous()
+
+ inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size
+ output_shape= inp_shape[: gather_idx] + \
+ [total_h,] + \
+ inp_shape[gather_idx + 1:]
+
+ output = output.view(output_shape)
+
+ return output
+
+
def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, async_op=False, handle=None, type=None):
seq_world_size = dist.get_world_size(group)
+ # we only need num_heads once
+ num_heads = input.shape[2]
+
+ if get_num_kv_heads() is not None or num_heads % seq_world_size != 0:
+ # Assuming here that the number of heads for q is consistent with kv
+ # If not, additional logic is required for cases like GQA
+ if get_num_kv_heads() is None:
+ assert num_heads > seq_world_size, f"Number of heads ({num_heads}) must be larger than sequence parallel size ({seq_world_size})"
+            # set the number of heads at the first call using the total head count;
+            # afterwards ``get_num_kv_heads() is not None`` re-enters the uneven path.
+ set_num_kv_heads(num_heads)
+        assert not async_op, "uneven head sequence parallelism does not support async op"
+ return uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group)
+
if batch_dim_idx == 0:
# b, s, n, h
if scatter_idx < 2:
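
The commented example inside `uneven_heads_all2all` (7 KV heads over a sequence-parallel world of 4) can be checked with a few lines of plain arithmetic; this sketch reuses the same variable names but is independent of the communication code.

```python
# Worked head-split arithmetic for 7 heads over 4 ranks; numbers match the
# comment block in uneven_heads_all2all.
num_kv_heads, seq_world_size = 7, 4

chunk_num_heads_small = num_kv_heads // seq_world_size  # 1 head on the small rank
chunk_num_heads_large = chunk_num_heads_small + 1  # 2 heads on the large ranks
num_chunk_heads_large = num_kv_heads % seq_world_size  # 3 ranks carry 2 heads
num_chunk_heads_small = seq_world_size - num_chunk_heads_large  # 1 rank carries 1 head

per_rank = [chunk_num_heads_large] * num_chunk_heads_large \
    + [chunk_num_heads_small] * num_chunk_heads_small
assert per_rank == [2, 2, 2, 1] and sum(per_rank) == num_kv_heads
```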
diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py
index c6a202d485eba..983e64642c696 100644
--- a/deepspeed/utils/__init__.py
+++ b/deepspeed/utils/__init__.py
@@ -16,7 +16,7 @@
from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state, safe_set_full_grad
from .tensor_fragment import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state
from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state
-from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter
+from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter, set_z3_leaf_module
from .mixed_precision_linkage import link_hp_params, lazy_init_hp_params_optimizer_state
from deepspeed.runtime.dataloader import RepeatingLoader
from .numa import get_numactl_cmd
diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py
index 9dd288ef46dbb..e9550a0ec25ad 100755
--- a/deepspeed/utils/groups.py
+++ b/deepspeed/utils/groups.py
@@ -484,6 +484,8 @@ def _get_sequence_parallel_rank():
global mpu
if mpu is not None and hasattr(mpu, 'get_sequence_parallel_rank'):
return mpu.get_sequence_parallel_rank()
+ if mesh_device is not None:
+ return dist.get_rank(mesh_device.get_group(mesh_dim="sequence_parallel"))
return 0
diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py
index 55fb72f6c9fba..d5db29485db39 100644
--- a/deepspeed/utils/logging.py
+++ b/deepspeed/utils/logging.py
@@ -7,8 +7,7 @@
import logging
import sys
import os
-import torch
-from deepspeed.runtime.compiler import is_compile_supported
+from deepspeed.runtime.compiler import is_compile_supported, is_compiling
log_levels = {
"debug": logging.DEBUG,
@@ -26,7 +25,7 @@ def create_warning_filter(logger):
def warn_once(record):
nonlocal warn
- if is_compile_supported() and torch.compiler.is_compiling() and not warn:
+ if is_compile_supported() and is_compiling() and not warn:
warn = True
logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to"
" disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1")
@@ -39,7 +38,7 @@ def logging_decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
- if torch.compiler.is_compiling():
+ if is_compiling():
return
else:
return func(*args, **kwargs)
diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py
index 4fe7cbba90aeb..aba3b5179d418 100644
--- a/deepspeed/utils/numa.py
+++ b/deepspeed/utils/numa.py
@@ -23,7 +23,10 @@
# ]
def get_numa_cores():
ret = []
- output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8")
+ try:
+ output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8")
+ except (FileNotFoundError, subprocess.CalledProcessError):
+ return []
lines = output.split('\n')
for line in lines:
if line.startswith('available:'):
diff --git a/deepspeed/utils/z3_leaf_module.py b/deepspeed/utils/z3_leaf_module.py
index 47d9ff698f1f0..14e8ae2d28235 100644
--- a/deepspeed/utils/z3_leaf_module.py
+++ b/deepspeed/utils/z3_leaf_module.py
@@ -4,7 +4,7 @@
# DeepSpeed Team
import torch
-from typing import List, Type
+from typing import List, Type, Union
def z3_leaf_module(model: torch.nn.Module) -> bool:
@@ -40,18 +40,24 @@ def get_z3_leaf_modules(model: torch.nn.Module) -> List[torch.nn.Module]:
return [module for module in model.modules() if z3_leaf_module(module)]
-def _do_set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type],
+def set_z3_leaf_module(model: torch.nn.Module, flag: bool):
+ model._z3_leaf = flag
+
+
+def _do_set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: Union[List[Type], List[str]],
flag: bool) -> List[torch.nn.Module]:
- assert all(isinstance(module_class, type) for module_class in leaf_module_classes), \
- f'leaf_module_classes must be a list of types, got {leaf_module_classes}'
+ assert all(isinstance(module_class, (type, str)) for module_class in leaf_module_classes), \
+ f'leaf_module_classes must be a list of types or names, got {leaf_module_classes}'
leaf_modules = []
def _set_z3_leaf_flag(model: torch.nn.Module):
nonlocal leaf_modules
- if model.__class__ in leaf_module_classes:
- model._z3_leaf = flag
- leaf_modules.append(model)
+ for module in leaf_module_classes:
+ if (isinstance(module, type) and model.__class__ == module) or \
+ (isinstance(module, str) and model.__class__.__name__ == module):
+ model._z3_leaf = flag
+ leaf_modules.append(model)
model.apply(_set_z3_leaf_flag)
@@ -61,13 +67,14 @@ def _set_z3_leaf_flag(model: torch.nn.Module):
return leaf_modules
-def set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type]) -> List[torch.nn.Module]:
+def set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: Union[List[Type],
+ List[str]]) -> List[torch.nn.Module]:
"""Sets a flag within a module in `model` to instruct ZeRO3 to stop setting hooks recursively when it encounters a module class listed in `leaf_module_classes`.
This is particularly useful in the context of Mixture of Experts (MoE) models. In MoE models, the computation order of experts varies across forward passes. This variability can disrupt ZeRO3's functionality, as ZeRO3 relies on tracking the computation order of modules to prefetch parameters efficiently. By designating a module as a 'leaf' node, ZeRO3 will prefetch parameters for all child modules upon entering the module.
Another scenario where this functionality is beneficial is in models with excessively fine-grained nested modules, where it helps to avoid the overhead associated with hooks.
Args:
model (torch.nn.Module): The model to which the leaf module flag will be applied.
- leaf_module_classes (List[Type]): A list of module classes that should be flagged as 'leaf' modules.
+ leaf_module_classes (Union[List[Type], List[str]]): A list of module classes that should be flagged as 'leaf' modules.
Returns:
List[torch.nn.Module]: A list of modules that match the module classes in `leaf_module_classes`.
"""
@@ -79,7 +86,7 @@ def unset_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type
See `set_z3_leaf_modules` for more details.
Args:
model (torch.nn.Module): The model to which the leaf module flag will be applied.
- leaf_module_classes (List[Type]): A list of module classes that should be flagged as 'leaf' modules.
+ leaf_module_classes (Union[List[Type], List[str]]): A list of module classes that should be flagged as 'leaf' modules.
Returns:
List[torch.nn.Module]: A list of modules that match the module classes in `leaf_module_classes`.
"""
diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py
index e69ecd9acb5a2..e93cb1c95f15c 100755
--- a/deepspeed/utils/zero_to_fp32.py
+++ b/deepspeed/utils/zero_to_fp32.py
@@ -21,7 +21,9 @@
import math
import os
import re
+import gc
import json
+import numpy as np
from tqdm import tqdm
from collections import OrderedDict
from dataclasses import dataclass
@@ -100,7 +102,7 @@ def get_model_state_files(checkpoint_dir):
def parse_model_states(files):
zero_model_states = []
for file in files:
- state_dict = torch.load(file, map_location=device)
+ state_dict = torch.load(file, map_location=device, weights_only=False)
if BUFFER_NAMES not in state_dict:
raise ValueError(f"{file} is not a model state checkpoint")
@@ -146,8 +148,8 @@ def parse_model_states(files):
def parse_optim_states(files, ds_checkpoint_dir):
total_files = len(files)
state_dicts = []
- for f in files:
- state_dict = torch.load(f, map_location=device)
+ for f in tqdm(files, desc='Loading checkpoint shards'):
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
# immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
# and also handle the case where it was already removed by another helper script
state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
@@ -179,19 +181,7 @@ def parse_optim_states(files, ds_checkpoint_dir):
else:
raise ValueError(f"unknown zero stage {zero_stage}")
- if zero_stage <= 2:
- fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
- elif zero_stage == 3:
- # if there is more than one param group, there will be multiple flattened tensors - one
- # flattened tensor per group - for simplicity merge them into a single tensor
- #
- # XXX: could make the script more memory efficient for when there are multiple groups - it
- # will require matching the sub-lists of param_shapes for each param group flattened tensor
-
- fp32_flat_groups = [
- torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
- ]
-
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
return zero_stage, world_size, fp32_flat_groups
@@ -398,9 +388,56 @@ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+ """
+ A pseudo tensor that lazily collects partitioned weights from the flattened groups.
+ It is more memory-efficient than eager concatenation when there are multiple parameter groups.
+ """
+
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+ self.flat_groups = flat_groups
+ self.flat_groups_offset = flat_groups_offset
+ self.offset = offset
+ self.partitioned_numel = partitioned_numel
+ self.shape = shape
+ self.dtype = self.flat_groups[0][0].dtype
+
+ def contiguous(self):
+ """
+ Merge partitioned weights from flat_groups into a single tensor.
+ """
+ end_idx = self.offset + self.partitioned_numel
+ world_size = len(self.flat_groups)
+ pad_flat_param_chunks = []
+
+ for rank_i in range(world_size):
+ # for each rank, we need to collect weights from related group/groups
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
+ start_group_id = None
+ end_group_id = None
+ for group_id in range(len(self.flat_groups_offset)):
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+ start_group_id = group_id
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+ end_group_id = group_id
+ break
+ # collect weights from related group/groups
+ for group_id in range(start_group_id, end_group_id + 1):
+ flat_tensor = flat_groups_at_rank_i[group_id]
+ start_offset = self.offset - self.flat_groups_offset[group_id]
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+ # collect weights from all ranks
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+ return param
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
param_shapes = zero_model_states[0].param_shapes
- avail_numel = fp32_flat_groups[0].numel() * world_size
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
# Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
# param, re-consolidating each param, while dealing with padding if any
@@ -424,7 +461,8 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
offset = 0
total_numel = 0
total_params = 0
- for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
unpartitioned_numel = shape.numel()
total_numel += unpartitioned_numel
total_params += 1
@@ -435,10 +473,9 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
)
- # XXX: memory usage doubles here
- state_dict[name] = torch.cat(
- tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
- 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ # memory-efficient pseudo tensor; materialized later via .contiguous()
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+ state_dict[name] = tensor
offset += partitioned_numel
offset *= world_size
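To make the offset bookkeeping above concrete, here is a small sketch that drives `GatheredTensor` with toy data (two ranks, two flattened groups of sizes 6 and 4); all values are invented, and the class is assumed importable from the patched `deepspeed.utils.zero_to_fp32` module:

```python
import numpy as np
import torch
from deepspeed.utils.zero_to_fp32 import GatheredTensor  # class added by this patch

# Toy data: 2 ranks, each holding two flattened fp32 groups of sizes 6 and 4.
flat_groups = [
    [torch.arange(0, 6, dtype=torch.float32), torch.arange(6, 10, dtype=torch.float32)],
    [torch.arange(100, 106, dtype=torch.float32), torch.arange(106, 110, dtype=torch.float32)],
]
flat_groups_offset = [0] + list(np.cumsum([t.numel() for t in flat_groups[0]]))  # [0, 6, 10]

# A parameter of shape (2, 3): each rank contributes 3 contiguous elements
# starting at flat offset 2, all lying within group 0 on that rank.
lazy = GatheredTensor(flat_groups, flat_groups_offset, offset=2, partitioned_numel=3, shape=torch.Size([2, 3]))
print(lazy.contiguous())
# tensor([[  2.,   3.,   4.],
#         [102., 103., 104.]])
```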
@@ -473,7 +510,29 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zer
return state_dict
-def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+ """
+ Convert state_dict of GatheredTensor to torch tensor
+ """
+ converted_tensors = {}
+ for name, tensor in state_dict.items():
+ tensor_id = id(tensor)
+ if tensor_id in converted_tensors:
+ shared_tensor = state_dict[converted_tensors[tensor_id]]
+ state_dict[name] = shared_tensor
+ else:
+ converted_tensors[tensor_id] = name
+ if return_empty_tensor:
+ state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+ else:
+ state_dict[name] = tensor.contiguous()
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+ tag=None,
+ exclude_frozen_parameters=False,
+ lazy_mode=False):
"""
Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
@@ -483,14 +542,12 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f
- ``checkpoint_dir``: path to the desired checkpoint folder
- ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
- ``exclude_frozen_parameters``: exclude frozen parameters
+ - ``lazy_mode``: get the state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory-efficient.
+ Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it.
Returns:
- pytorch ``state_dict``
- Note: this approach may not work if your application doesn't have sufficient free CPU memory and
- you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
- the checkpoint.
-
A typical usage might be ::
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
@@ -506,6 +563,16 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f
If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint. Alternatively, you can load the state_dict in lazy mode ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+ for name, lazy_tensor in state_dict.items():
+ tensor = lazy_tensor.contiguous() # to cpu
+ print(name, tensor)
+ # del tensor to release memory if it is no longer in use
"""
if tag is None:
latest_path = os.path.join(checkpoint_dir, 'latest')
@@ -520,7 +587,11 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f
if not os.path.isdir(ds_checkpoint_dir):
raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
- return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+ if lazy_mode:
+ return state_dict
+ else:
+ return to_torch_tensor(state_dict)
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
@@ -541,6 +612,7 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
- ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
- ``exclude_frozen_parameters``: exclude frozen parameters
"""
+
# Dependency pre-check
if safe_serialization:
try:
@@ -556,13 +628,18 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
raise
# Convert zero checkpoint to state_dict
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+ tag,
+ exclude_frozen_parameters,
+ lazy_mode=True)
# Shard the model if it is too big.
weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
if max_shard_size is not None:
filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
- state_dict_split = split_torch_state_dict_into_shards(state_dict,
+ # a memory-efficient approach to sharding: plan the split from empty tensors instead of materializing the gathered weights
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
filename_pattern=filename_pattern,
max_shard_size=max_shard_size)
else:
@@ -571,15 +648,22 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
state_dict_split = StateDictSplit(is_sharded=False,
filename_to_tensors={weights_name: list(state_dict.keys())})
- # Save the model
+ # Save the model shard by shard
+ os.makedirs(output_dir, exist_ok=True)
filename_to_tensors = state_dict_split.filename_to_tensors.items()
for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
- shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+ shard_state_dict = to_torch_tensor(shard_state_dict)
output_path = os.path.join(output_dir, shard_file)
if safe_serialization:
- save_file(shard, output_path, metadata={"format": "pt"})
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
else:
- torch.save(shard, output_path)
+ torch.save(shard_state_dict, output_path)
+ # release the memory of current shard
+ for tensor_name in shard_state_dict:
+ del state_dict[tensor_name]
+ del shard_state_dict
+ gc.collect()
# Save index if sharded
if state_dict_split.is_sharded:
diff --git a/docker/gh-builder/Dockerfile.py311 b/docker/gh-builder/Dockerfile.py311
new file mode 100644
index 0000000000000..603fb614314fc
--- /dev/null
+++ b/docker/gh-builder/Dockerfile.py311
@@ -0,0 +1,35 @@
+# Start with NGC container
+FROM nvcr.io/nvidia/pytorch:24.03-py3
+
+# Set noninteractive mode for apt-get
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install necessary dependencies for building Python
+RUN apt-get update && apt-get install -y \
+ wget \
+ build-essential \
+ libssl-dev \
+ zlib1g-dev \
+ libbz2-dev \
+ libreadline-dev \
+ libsqlite3-dev \
+ curl \
+ libncursesw5-dev \
+ libgdbm-dev \
+ libc6-dev \
+ libffi-dev \
+ tk-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Download and install Python 3.11
+RUN wget https://www.python.org/ftp/python/3.11.9/Python-3.11.9.tgz \
+ && tar xzf Python-3.11.9.tgz \
+ && cd Python-3.11.9 \
+ && ./configure --enable-optimizations \
+ && make altinstall \
+ && cd .. \
+ && rm -rf Python-3.11.9 Python-3.11.9.tgz
+
+# Set Python 3.11 as the default Python version
+RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.11 1 \
+ && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.11 1
diff --git a/docker/gh-builder/Dockerfile.py312 b/docker/gh-builder/Dockerfile.py312
new file mode 100644
index 0000000000000..a0a7193201d47
--- /dev/null
+++ b/docker/gh-builder/Dockerfile.py312
@@ -0,0 +1,35 @@
+# Start with NGC container
+FROM nvcr.io/nvidia/pytorch:24.03-py3
+
+# Set noninteractive mode for apt-get
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install necessary dependencies for building Python
+RUN apt-get update && apt-get install -y \
+ wget \
+ build-essential \
+ libssl-dev \
+ zlib1g-dev \
+ libbz2-dev \
+ libreadline-dev \
+ libsqlite3-dev \
+ curl \
+ libncursesw5-dev \
+ libgdbm-dev \
+ libc6-dev \
+ libffi-dev \
+ tk-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Download and install Python 3.12
+RUN wget https://www.python.org/ftp/python/3.12.5/Python-3.12.5.tgz \
+ && tar xzf Python-3.12.5.tgz \
+ && cd Python-3.12.5 \
+ && ./configure --enable-optimizations \
+ && make altinstall \
+ && cd .. \
+ && rm -rf Python-3.12.5 Python-3.12.5.tgz
+
+# Set Python 3.12 as the default Python version
+RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.12 1 \
+ && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.12 1
diff --git a/docker/Dockerfile.rocm b/docker/rocm/Dockerfile
similarity index 100%
rename from docker/Dockerfile.rocm
rename to docker/rocm/Dockerfile
diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md
index adb2f1679ea0e..51e3bbd6eaaa2 100755
--- a/docs/_pages/config-json.md
+++ b/docs/_pages/config-json.md
@@ -489,6 +489,11 @@ Enabling and configuring ZeRO memory optimizations
|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- |
| Consolidate the weights before saving the model by `save_16bit_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | `False` |
+***stage3_module_granularity_threshold***: [integer]
+| Description | Default |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- |
+| The granularity of a module is determined by the ratio of `parameter_count` / `(1 + descendant_count)`. ZeRO3 classifies modules with a granularity below the threshold as fine-grained, treating them as integral units during parameter fetching. This reduces host and communication overhead from separate hooks. | `0` |
+
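For orientation, a minimal sketch of where this key would sit in a ZeRO-3 config passed to `deepspeed.initialize`; the threshold value is an arbitrary example:

```python
# Illustrative ZeRO-3 config fragment; 32000 is an arbitrary example threshold.
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {
        "stage": 3,
        "stage3_module_granularity_threshold": 32000,
    },
}
```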
***zero_hpz_partition_size***: [integer]
| Description | Default |
diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md
index 3014be18d6820..f833acebde9af 100755
--- a/docs/_tutorials/bert-finetuning.md
+++ b/docs/_tutorials/bert-finetuning.md
@@ -10,14 +10,14 @@ In this tutorial we will be adding DeepSpeed to the BingBert model for the SQuAD
If you don't already have a copy of the DeepSpeed repository, please clone in
now and checkout the DeepSpeedExamples submodule the contains the BingBertSquad
-example (DeepSpeedExamples/BingBertSquad) we will be going over in the rest of
+example (DeepSpeedExamples/training/BingBertSquad) we will be going over in the rest of
this tutorial.
```shell
git clone https://github.com/microsoft/DeepSpeed
cd DeepSpeed
git submodule update --init --recursive
-cd DeepSpeedExamples/BingBertSquad
+cd DeepSpeedExamples/training/BingBertSquad
```
### Pre-requisites
diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md
index ce9e3ee9a8922..36dcdf41d9d08 100644
--- a/docs/_tutorials/getting-started.md
+++ b/docs/_tutorials/getting-started.md
@@ -9,7 +9,7 @@ tags: getting-started
* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/deepspeed)
-* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/docs/transformers/main_classes/deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed).
+* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/docs/transformers/deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed).
* DeepSpeed on AMD can be used via our [ROCm images](https://hub.docker.com/r/deepspeed/rocm501/tags), e.g., `docker pull deepspeed/rocm501:ds060_pytorch110`.
* DeepSpeed also supports Intel Xeon CPU, Intel Data Center Max Series XPU, Intel Gaudi HPU, Huawei Ascend NPU etc, please refer to the [accelerator setup guide](/tutorials/accelerator-setup-guide/)
diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md
index b1a8b53697610..e66bba3f818b2 100644
--- a/docs/_tutorials/onebit-adam.md
+++ b/docs/_tutorials/onebit-adam.md
@@ -136,7 +136,7 @@ You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [Hug
### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam
-We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun.
+We provide example scripts under [DeepSpeedExamples/training/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun.