diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index ac6266121f54..a168af277fb8 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -27,12 +27,13 @@ jobs: which python python --version - - name: Install deepspeed + - name: Install dependencies run: | - pip install .[dev,autotuning,triton] - ds_report + # Previously we would do pip install .[dev] but this is causing out of + # space errors start with torch 2.1.0 release + grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install - name: Formatting checks run: | - pip show pre-commit clang-format - pre-commit run --all-files + pip show pre-commit clang-format + pre-commit run --all-files diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index fce87a849cfe..c90b7c72ac0b 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -27,7 +27,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch + pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 341dea57b663..75d2dc732d4d 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -26,7 +26,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index a8c5a9afefcc..2b101b023d67 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -26,7 +26,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch + pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 544fb50acec3..2b91df3ae44c 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -26,7 +26,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -52,8 +52,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.0" --cuda_ver="11.7" - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="11.7" + coverage run 
--concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.1" --cuda_ver="11.8" + coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.1" --cuda_ver="11.8" - name: Coverage report run: | diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index c2bf5919f20d..d0df6e546982 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116 + pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 4e5a34365f52..1cc0c6588610 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -27,7 +27,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install -U --cache-dir $TORCH_CACHE torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000000..093bc98d41e8 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,47 @@ +name: Build and publish DeepSpeed release + +on: + push: + tags: + - 'v*.*.*' + +jobs: + deploy: + runs-on: ubuntu-20.04 + environment: release-env + + steps: + - uses: actions/checkout@v3 + with: + ref: "master" + - name: Get release version from tag + run: | + echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV + - name: Check release version + run: | + python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }} + - name: Build DeepSpeed + run: | + DS_BUILD_STRING=" " python setup.py sdist_wheel + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + repository-url: https://upload.pypi.org/legacy/ + - name: Bump version + run: | + python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }} + - name: Create Pull Request + uses: peter-evans/create-pull-request@v4 + with: + token: ${{ secrets.GH_PAT }} + add-paths: | + version.txt + body: | + **Auto-generated PR to update version.txt after a DeepSpeed release** + Released version - ${{ env.RELEASE_VERSION }} + Author - @${{ github.actor }} + branch: AutoPR/${{ env.RELEASE_VERSION }} + assignees: ${{ github.actor }} + title: "Update version.txt after ${{ env.RELEASE_VERSION }} release" + author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com> diff --git a/README.md b/README.md index e4560979b124..abf20667882b 100755 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ ## Latest News DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn 
how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). -* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) +* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)] * [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] * [2023/08] [DeepSpeed ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) * [2023/08] [DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md) diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md new file mode 100644 index 000000000000..e91ff1ecd51e --- /dev/null +++ b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md @@ -0,0 +1,181 @@ + +
+ +# DeepSpeed-VisualChat:多轮图像+文字,为你展现不一样的AI聊天魅力 + +
+ +
+ +DeepSpeed-VisualChat! + +
+ +要引用 DeepSpeed-VisualChat,请引用我们的 [arxiv 报告](https://arxiv.org/abs/2309.14327): + + +``` +@article{yao2023deepspeed-visualchat, + title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}}, + author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He}, + journal={arXiv preprint arXiv:2309.14327}, + year={2023} +} +``` + +# 1. 概述 +大型语言模型 (LLMs),如 GPT 和 LLaMa,在各种文本生成和理解任务中都展现出了卓越的能力,特别是在经过零次/少次学习(zero-/few-shot learning)或微调(instructed fine-tuning)后。然而,要让 AI 模型为多样化的任务做好准备,需要加入的一个关键特性是多模态能力;例如,AI 模型应该能够读取图像、听到声音、观看视频等。这种能力在纯文本基础的 LLMs 中基本上是不存在的。 + +最近,大量的研究项目开始探索将视觉能力引入到 LLMs 中,特别是通过插入图片输入使 LLMs 来理解图片(简称为大型视觉语言模型或 LVLMs)。 + +大多数现有工作的主要缺点是: +* 重点主要放在与单一图像相关的任务上,如视觉问题回答和字幕,或处理需要同时输入的多个图像。两种方法都不太擅长管理交错的图像和文本输入。 +* 系统的可扩展性仅限于具有约 10B 参数的模型,这比最大的开源模型小了一个数量级。 + +然而,对于一个真正的 AI 聊天模型,输入内容可能是与文本交错的多个图像,这是目前的工作很少涉及的情况。此外,随着模型大小的增加,LLMs 的生成能力增长迅速。因此,将系统能力集中在约 10B 的模型上限制了对 LVLMs 潜力的进一步探索。 + +为了解决这些问题,我们推出了 DeepSpeed-VisualChat(请参阅 [arxiv 报告](https://arxiv.org/abs/2309.14327) 以获取更多详细信息),带有以下新特性: + +* ***全开源多轮多图框架与前所未有的可扩展性***:DeepSpeed-VisualChat,作为开创性的全开源框架之一,支持多轮和多图对话,容纳交错的文本和图像输入。我们利用 DeepSpeed 提高我们的训练效果,使用一个 2B 的视觉编码器和一个 70B 的 LLaMA-2 解码器模型,展示了我们框架的显著可扩展性。 +* ***多模态因果注意力 (MMCA)*** 我们为多模态模型设计了一个新的 MMCA 注意力机制,独立地计算各种模态的注意力权重。MMCA 达到了与传统交叉注意机制类似的目标,但为生成任务提供了增强的因果注意解释,消除了对额外模块或参数的需求。与标准的因果注意力相比,它还提供了更好的训练数据效率。 +* ***交错输入的数据混合*** 为了促进交错模态的对话,DeepSpeed-VisualChat 在现有数据集上采用了各种数据混合技术,克服了大多数现有开源数据集中交错文本和图像输入的短缺。 + +# 2. 模型架构概述 +
+ 模型结构 + + *图 1:DeepSpeed-VisualChat 的模型架构示意图。* +
+ +如 *图 1* 所示,DeepSpeed-VisualChat 的模型架构由三个部分组成:一个视觉编码器,如 CLIP;一个语言解码器,如 LLaMa-7B;和一个特征对齐线性投影层。模型的大部分都是冻结的,只有语言模型的嵌入和线性投影层是可训练的。因此,可训练参数的总数大约在 O(10M) (LLaMa-2-13B) 到 O(100M) (LLaMa-2-70B) 之间。 + +# 3. DeepSpeed 多模态因果注意力 + +用于在多模态模型中连接视觉和文本组件的两种常见注意机制是:因果注意力,如在 MiniGPT 和 QWen-VL 中使用的,以及交叉注意力,如在 Otter 和 Flamingo 中使用的。 + +
+ 不同的注意机制 + + *图 2:不同的注意机制:使用一个输入句子“用户:请描述这个图片。”和三个图像令牌(I-token1、I-token2、I-token3)来比较不同的注意机制。在左边,我们展示了标准的因果注意力,将图像令牌视为文本。在中间,我们展示了应用于图像的交叉注意力,同时保持文本令牌的标准因果注意力。在右边,我们展示了我们的创新 MMCA 注意力机制,其中图像令牌只执行自注意,文本令牌独立地注意文本/图像令牌,橙色为图像部分。这种机制由:softmax($`QK^T \odot M_1`$)+ softmax($`QK^T \odot M_2`$) 定义,其中 Q 和 K 分别为查询和密钥,$`M_1`$=[M==1],和 $`M_2`$=[M==2],其中 M $`\in`$ R10x10。* +
+ +因果注意力 (CA):基于 CA 的方法简单地将视觉特征(即来自最终视觉编码器层输出的特征)投影到文本特征,并将它们与文本嵌入层后的正常文本特征组合,以送入 LLMs。CA 的好处是它是 LLMs 原始注意机制的自然扩展,因此,它不引入任何额外的模块或参数。但是,直觉上这种方法会带来一些问题: + +* 每个视觉令牌会关注它之前的视觉和文本令牌。然而视觉令牌已经以双向方式完全编码,不需要进一步关注它之前的视觉和文本令牌。 +* 对于一个文本令牌,模型需要学习如何在其之前的文本和图像令牌之间分配其注意权重。由于这些问题,我们发现 LVLMs 中 CA 的数据效率通常是有问题的。为了解决这个问题,LLaVA 和 QWen-VL 需要视觉-语言预训练来完全对齐视觉特征和文本特征。 + +交叉注意力 (CrA):作为替代方案,交叉注意力 (CrA) 与 CA 的结合展示出更好的数据效率,但也带有一些缺点: + +* 它为模型引入了新的参数。例如,具有交叉注意力引入的新参数的 Otter 拥有超过 15 亿的可训练参数。和 LLaVA 的百万级可训练参数相比,这大大增加了训练成本和内存需求。 +* 如果在训练过程中中间引入了一个图像,需要仔细设计,因为先前的文本令牌不应该能够注意到图像。 + +多模态因果注意机制 (MMCA):为了克服这些问题,我们提出了一种新的多模态因果注意机制 (MMCA),它既有 CA 的参数效率,又有 CrA 的数据效率。总体思路如下: + +* 对于视觉令牌,它们只关注自己,因为视觉令牌是由视觉编码器编码的。 +* 对于文本令牌,它们关注所有以前的令牌。但是,对文本和图像令牌 MMCA 使用两个单独的注意权重矩阵。 + +MMCA 的第二点背后的直觉是,一个模态的注意权重可能会影响另一个模态。例如,文本令牌可能会比视觉信息更多地关注文本信息。因此,如果注意权重矩阵在两种模态之间进行归一化,那么视觉令牌的注意得分可能会非常小。请参考 *图 2* 以查看三种注意机制的可视化。 + +演示结果。我们首先通过几个例子展示在不同的注意机制下 DeepSpeed-VisualChat 的单图像视觉语言对话功能。在这些实验中,我们使用 LLaMA2-7B 语言模型和 QWen-VL 视觉编码器作为我们的视觉编码器。这两个模型通过一个简单的线性投影层连接在一起。这个模型在两个 LLaVa 数据集上进行了训练。正如 *图 3* 和 *图 4* 所示,当与 MMCA 配合使用时,DeepSpeed-VisualChat 有效地识别了图像中的视觉细节,对用户的问题提供了准确通顺的回答。 +此外,与其他注意机制(如使用因果注意力和交叉注意力的组合)相比,MMCA 表现出更全面和精确的图像细节把握。与 CrA 和 CA 的组合以及 MMCA 相比,仅使用 CA 可能会显示出稍微多一些的错误(*图 3*)或导致较低的理解能力(*图 4*)。 + +
+ 小猫咪 + + *图 3:示例视觉和语言输入,显示了(1)标准因果注意力 (CA) (2)与交叉注意力组合的标准因果注意力 (CA+ CrA) 和(3)DeepSpeed-VisualChat 中的特殊多模态因果注意力 (MMCA) 之间的输出比较。* +
+ +
+ 美丽的湖泊 + + *图 4:DeepSpeed-VisualChat 准确地识别了场景是一个美丽的湖泊,并提供了一组合理的建议。相比之下,其他的注意力机制误解了图像认为其包含“带船坡的码头”。* +
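To make the MMCA mechanism described above concrete, here is a minimal PyTorch sketch of the two masks and the two independent softmax normalizations from *图 2*. It is an illustration written for this post rather than the DeepSpeed-VisualChat implementation; in particular, the exact visibility rule used for image tokens (here: image tokens attend to all image tokens) is a simplifying assumption.

```python
import torch

def mmca_weights(scores, is_image):
    """scores: [seq, seq] raw attention logits (q @ k.T / sqrt(d));
    is_image: bool tensor [seq], True for image tokens."""
    seq = scores.size(0)
    causal = torch.ones(seq, seq, device=scores.device).tril().bool()
    img_q, txt_q = is_image.unsqueeze(1), (~is_image).unsqueeze(1)  # query-side flags
    img_k, txt_k = is_image.unsqueeze(0), (~is_image).unsqueeze(0)  # key-side flags

    # M1: image tokens attend only to image tokens; text tokens also see earlier image tokens.
    m1 = (img_q & img_k) | (txt_q & img_k & causal)
    # M2: text tokens attend causally to text tokens; image rows have no M2 entries.
    m2 = txt_q & txt_k & causal

    def masked_softmax(logits, mask):
        w = torch.softmax(logits.masked_fill(~mask, float("-inf")), dim=-1)
        return torch.nan_to_num(w)  # a row with no visible keys in a modality contributes zero

    # softmax(QK^T ⊙ M1) + softmax(QK^T ⊙ M2): each modality is normalized independently.
    return masked_softmax(scores, m1) + masked_softmax(scores, m2)
```

Normalizing the two modalities separately is what keeps the text-to-text attention scores from drowning out the scores assigned to image tokens, which is the intuition stated above.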
+ +# 4. 数据混合 +我们使用了 3 个来源的 9 个数据集,如我们的 [arxiv 报告](https://arxiv.org/abs/2309.14327) 所述。一个实现多轮和多图对话的关键缺失元素是没有足够的数据。我们找到的唯一的多轮多图数据来源是 SparklesDialogue 数据集,它只包含 6520 个样本。为了解决这个问题,我们采用了两种方法,从现有的单图或单轮数据中合成多轮多图数据:简单的数据连接和 LLaVA-Otter 数据混合。 + +## 4.1 简单数据连接 +对于 LLaVA 模型使用的 "llava" 和 "llava_dial" 数据集,每个样本包括单图像的单轮/多轮对话。为了模拟用户依次询问多个图像的情况,我们对这两个数据集进行了简单的数据后处理。具体来说,我们随机将不同数量的样本连接成一个样本。在 "llava" 的情况下,我们连接了 1 到 3 个样本,而在 "llava_dial" 的情况下,我们连接了 1 到 2 个样本。 + +## 4.2 LLaVA-Otter 数据混合 +我们注意到,LLaVA 模型使用的 llava 和 llava_dial 数据集以及 Otter 模型使用的 otter_mimicit_cgd 数据集都使用了 COCO train2017 图像。对于 llava 和 llava_dial 数据集,每个样本包括一个图像的单轮/多轮对话。对于 otter_mimicit_cgd 数据集,每个样本包括一对图像的单轮对话。这使我们能够构建一个合成的多轮多图数据 llava_otter_blend 作为更自然的混合:对于 otter_mimicit_cgd 数据集中的每个样本,我们寻找使用相同图像的 llava 和 llava_dial 样本,然后以 "llava/llava_dial 对话然后 otter_mimicit_cgd 对话" 的方式构建一个新样本。 + +
+ 朋友们 + + *图 5:经过 LLaVA-Otter 数据混合后的数据样本。灰色对话框来自 LLaVA 数据集,橙色对话框来自 Otter 数据集。* +
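As a rough illustration of the blending rule above, the sketch below pairs each otter_mimicit_cgd sample with the llava/llava_dial conversations that use the same COCO images and prepends them. The sample schema and field names ("image", "images", "turns") are assumptions made for this example, not the actual DeepSpeed-VisualChat data loaders.

```python
from collections import defaultdict

def blend_llava_otter(llava_samples, otter_samples):
    """llava_samples: single-image conversations; otter_samples: image-pair conversations.
    All field names here are illustrative assumptions."""
    by_image = defaultdict(list)
    for s in llava_samples:
        by_image[s["image"]].append(s)

    blended = []
    for o in otter_samples:
        # collect llava / llava_dial turns that discuss the same two COCO images
        prefix = [t for img in o["images"] for m in by_image.get(img, []) for t in m["turns"]]
        if prefix:
            # "llava/llava_dial conversation first, then the otter_mimicit_cgd conversation"
            blended.append({"images": o["images"], "turns": prefix + o["turns"]})
    return blended
```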
+ +# 5. 演示 +我们在几个开源数据集上训练了我们的 DeepSpeed-VisualChat-13B 模型,该模型使用一个 2B 的视觉编码器和 13B 的 LLaMA 模型。DeepSpeed-VisualChat-13B 展示了图像字幕功能(*图 6--8*),计数和文本阅读(*图 6*),名人识别(*图 7*),讲故事(*图 8*)等。 + +
+ 朋友们 + + *图 6:DeepSpeed-VisualChat 可以计算图像中的人数,并读取第一张图像中的文本。它还展示了跨图像的理解。* +
+ +
+ CEO + + *图 7:DeepSpeed-VisualChat 可以识别名人并将他们与其成就联系起来。* +
+ +
+ 疯狂动物城 + + *图 8:DeepSpeed-VisualChat 可以讲故事并识别电影。* +
+ +# 6. 如何开始使用 DeepSpeed-VisualChat +DeepSpeed-VisualChat 是一个易于使用的训练框架,具有很好的可扩展性,到目前为止已经在 LLaMa-2-70B 模型上进行了测试。我们为所有实验采用了统一的指令调优格式,模板如下所示。 +``` + % You are a powerful vision-language assistant. + +### Image 1: % some image, e.g., cat-1.png +### Question: % please describe the image. +### Answer: % It's a cute black cat. + +### Image 2: % some image, e.g., cat-2.png +### Image 3: % some image, e.g., cat-3.png +### Question: % What's the difference between the three cats? +### Answer: % The colors of the three cats are different. +... +``` + +使用 DeepSpeed-VisualChat 训练模型是简单和方便的。这里我们给出了基于 CLIP 视觉编码器和 LLaMa-7B 模型的一个例子: + +``` +git clone https://github.com/microsoft/DeepSpeedExamples.git +cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ +pip install -r requirements.txt +cd training +bash training_scripts/run_7b.sh +``` + +训练后的模型权重将自动保存为 Hugging Face 兼容版本,并且可以用于启动您自己的视觉聊天 API: +``` +cd ../chat +bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt path +``` + +为了支持更大的模型推理,我们已经将 Hugging Face 大模型推理集成到我们的 DeepSpeed-VisualChat API 中。因此,用户可以根据 GPU 内存容量和模型大小选择不同数量的 GPU。 + +请参考我们的 [GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) 了解更多细节。 + +# 7. 发布:今天尝试 DeepSpeed-VisualChat! + +我们非常兴奋地分享 DeepSpeed-VisualChat 现已开源并供 AI 社区使用。 + +* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) + +* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。 + +DeepSpeed-VisualChat 是更大的 DeepSpeed 生态系统的一部分,其中包括一系列深度学习系统和建模技术。要了解更多信息, + +* 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。 +* 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。 + +我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 + +* 如果你喜欢我们的工作,请在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) 上为我们的仓库点“星”。 diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md new file mode 100755 index 000000000000..ec23e005bff6 --- /dev/null +++ b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md @@ -0,0 +1,188 @@ +
+ +# DeepSpeed-VisualChat: 複数ラウンド・複数画像の入力が可能なAIチャット体験を実現 +
+ +
+ +DeepSpeed-VisualChat! + +
+ +DeepSpeed-VisualChatを引用する場合、[arxiv上のレポート](https://arxiv.org/abs/2309.14327)を引用してください。 + +``` +@article{yao2023deepspeed-visualchat, + title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}}, + author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He}, + journal={arXiv preprint arXiv:2309.14327}, + year={2023} +} +``` + +# 1. 概要 +GPTやLLaMaのような大規模言語モデル(LLM)は、テキスト生成やテキスト理解などの多くのタスクにおいて、Zero-shot/Few-shot学習、あるいはinstructed fine-tuningによって、非常に優れた能力を示してきました。しかし、AIエージェントをより多様なタスクに対応させるには、マルチモーダルを扱う能力が必要です。例えば、AIエージェントは画像を読んだり、音声を聞いたり、ビデオを見たりすることができる必要があります。こうした機能は、テキストベースのLLMにはほとんどありません。 + +近年、LLMに視覚的な能力を導入することは、研究・実践の両方において広く試みられています。特に、画像をそのまま与えて、LLMが理解できるようにする取り組みが行われています(大規模視覚言語モデル、略してLVLMなどと呼ばれる)。 + +こうした分野における、既存の研究の主な問題は以下の通りです: + +* 視覚に関する質問への回答やキャプション付けのように、単一の画像に関連するタスクや、同時に入力される複数の画像の処理に重点が置かれており、画像とテキストが交互に入力されるような状況には対応していない +* システムのスケーラビリティは、~10Bのパラメータを持つモデルに限定される + +しかし、本来はAIチャットエージェントには、複数のテキストと画像の両方が与えられる可能性があります。また、LLMの生成能力は、モデルサイズが大きくなるにつれて急速に向上することが知られており、~10Bのモデルではその能力が制限されてしまいます。 + +これらの問題を解決するために、我々は以下の新たな機能を備えたDeepSpeed-VisualChat(詳細は[arxivのレポート](https://arxiv.org/abs/2309.14327)を参照)を開発しました: + +* ***完全にオープンソース化され、前例のないスケーラビリティを備えた複数ラウンド・複数画像を処理できるフレームワーク***: DeepSpeed-VisualChatは、完全にオープンソース化された先進的なフレームワークの1つであり、複数ラウンドを通じて画像とテキストが両方与えられる対話を可能にします。また、DeepSpeedを利用することで、比類ないスケーラビリティを実現しており、実際に2Bのビジュアルエンコーダーと70BのLLaMA-2デコーダーモデルで訓練を行えます。 +* ***マルチモーダル因果的注意(MMCA)***: マルチモーダルモデルのための新しいアテンションMMCA(Multi-Modal Causal Attention)を考案し、異なるモダリティ間で独立にアテンションの重みを計算します。MMCAは、従来のcross attentionに類似したものですが、生成タスクのためのcausal attentionを強化しており、追加のモジュールやパラメータが不要になります。また、標準的なcausal attentionと比較して、優れた訓練データ効率を示します。 +* ***順次与えられる画像とテキストを扱うためのデータブレンディング***: DeepSpeed-VisualChatは、既存のデータセットに様々なデータブレンディング技術を採用しています。これにより、順次与えられるテキストと画像の不足という、利用可能なオープンソースデータセットのほとんどに当てはまる課題を克服しています。 + +# 2 モデルアーキテクチャの概要 +
+ model arch + + *図1: モデルアーキテクチャの概要* + +
+ +*図1*に示すように、DeepSpeed-VisualChatのモデルアーキテクチャは、CLIPのような視覚エンコーダー、LLaMa-7Bのような言語デコーダー、特徴アライメントを行う linear projectionレイヤの3つのコンポーネントで構成されています。モデルのほとんどのパラメータは固定されており、言語モデルのembeddingとlinear projectionレイヤのみが学習可能です。その結果、学習可能なパラメータの総数は O(10M) (LLaMa-2-13B) から O(100M) (LLaMa-2-70B) となります。 + +# 3. DeepSpeed マルチモーダル Causal Attention (MMCA) + +マルチモーダルモデルで、画像とテキストをつなぐ一般的なattentionの機構は二つあります。一つはMiniGPTやQWen-VLで使われているようなcausal attentionで、もう一つはOtterやFlamingoで使われているようなcross attentionです。 + + +
+ Different attention mehanisms + + *図2: 異なるアテンションの機構: 「ユーザー:画像を説明してください」という入力文と3つの画像トークン(I-token1、I-token2、I-token3)と組み合わせて与えた場合の、それぞれのattention機構の構成を示しています。左側では、標準的なcausal attentionによって、画像トークンをテキストとして扱う様子を示しています。中央は、テキストトークンに対する標準的なcausal attentionを維持しながら、画像に適用されるcross attentionを使用する様子を示しています。右側では、画像トークンはself attentionのみを行い、テキストトークンはテキスト/画像トークンへのアテンションを独立に計算するという、新しいマルチモーダルのためのアテンションの提案を、オレンジ色のマスクで強調して示しています。この仕組みは、Q, Kをクエリとキーとしたとき、 softmax($`QK^T \odot M_1`$)+ softmax($`QK^T \odot M_2`$)として定義されます。M $`\in`$ R10x10としたとき、$`M_1`$=[M==1], and $`M_2`$=[M==2] です。* +
+ +Causal Attention(CA):CAに基づく方法は、視覚的特徴(最終的な視覚エンコーダ層の出力からの特徴)を単純にテキストの特徴量に投影し、テキスト埋め込み層以降の通常のテキストの特徴量と組み合わせてLLMに送り込むというものです。CAの利点は、LLMにおける本来のアテンション機構の自然な拡張であり、そのため余分なモジュールやパラメータを導入しないことです。しかし、このアプローチにはいくつかの直感的な問題があります: + +* 視覚トークンはすでに双方向に特徴量に変換されており、本来他の視覚トークンやテキストトークンとのアテンションの必要はありませんが、実際には前の視覚またはテキストトークンとのアテンションがあります。。 +* テキストトークンの場合、モデルは前のテキストトークンと画像トークンとの間でどのようにアテンションの重みを配分するかを学習する必要があります。これらの問題により、LVLMにおけるCAのデータ効率にはしばしば問題があることが分かりました。この問題への対処として、LLaVAとQWen-VLは、視覚的特徴とテキストの特徴を完全に対応させるために、視覚言語の事前学習を必要とします。 + +Cross Attention (CrA):代替案であるCross Attention (CrA) と CAの組み合わせは、より優れたデータ効率を示しますが、いくつかの欠点もあります: + +* モデルに新しいパラメーターを導入する必要があります。例えば、Otterは、Cross Attentionによって導入された新しいパラメータがあるため、LLaVAが数百万個の学習可能なパラメータを持つのに対し、15億個以上のパラメータを必要とします。これにより、学習コストと必要メモリ量が大幅に増加します。 +* 訓練中に会話の途中で画像が与えられた場合、前のテキストトークンは与えられた画像とのアテンションを求められないので、慎重な設計が必要です。 + +マルチモーダル Causal Attention (MMCA):これらの問題を解決するために、我々は新しいマルチモーダルCausal Attention (MMCA) を提案します。この機構は、CAと同様のパラメータ効率と、CrAと同様のデータ効率の、両方の利点を持つものです。全体的なアイデアは以下の通りです: + +* 視覚トークンは視覚エンコーダによってエンコードされるため、視覚トークンは自分自身とのアテンションのみを利用する。 +* テキストトークンについては、その前のすべてのトークンに注目する。ただし、前のテキストトークンと画像トークンに対して、それぞれ別々のアテンションの重み行列を持つ。 + +MMCAの2つ目のポイントは、1つのモダリティに対するアテンションの重みが、もう1つのモダリティに影響を与える可能性があるということです。例えば、テキストトークンは、視覚情報よりもテキスト情報により大きなアテンションを持つかもしれません。そのため、アテンションの重み行列を両方のモダリティで正規化すると、視覚トークンのアテンションスコアが非常に小さくなる可能性があります。3つのアテンション機構の視覚化については、*図2*を参照してください。 + +出力例 まず、異なるアテンションの機構を採用した、画像を一つだけ用いた会話におけるDeepSpeed-VisualChatの能力を示す様々な例を紹介します。これらの実験では、LLaMA2-7B言語モデルとQWen-VL視覚エンコーダを視覚エンコーダとして併用します。これら2つのモデルはlinear projection layerを介して接続されています。このモデルは2つのLLaVaデータセットで学習を行いました。*図3*と*図4*で実証されているように、DeepSpeed-VisualChatはMMCAと組み合わされることで、画像内の視覚的な詳細を効果的に識別し、ユーザーのクエリに対して首尾一貫した応答を提供します。さらに、MMCAは、Causal AttentionとCross Attentionの両方から合成されたマスクを使用するような、別のアテンション機構と比べて、より包括的で正確な画像詳細の把握が可能です。また、CrAとCAの組み合わせやMMCAとは対照的に、CA単独では若干エラーが多く(*図3*)、推論能力の程度が低い(*図4*)可能性があることも明らかです。 + +
+ Small kitten + + *図3: (1) 標準的なcausal attention (CA) (2) cross attentionと組み合わせた標準的なcausal attention (CA+CrA) (3)DeepSpeed-VisualChatの特別なマルチモーダルCausal Attention (MMCA) の出力比較を示す視覚入力と言語入力の例。* +
+ +
+ Beautiful lake + + *図4:DeepSpeed-VisualChatは、示された場面を美しい湖として正確に識別し、妥当な提案のセットを提示する。対照的に、ベースラインは画像を「ボート乗り場のあるドック」と誤認識している。* +
+ +# 4. データブレンディング + +[arxivのレポート](https://arxiv.org/abs/2309.14327)に記載されているように、訓練には3つのソースから9つのデータセットを使用しました。複数ラウンド・複数画像の入力を可能にするために決定的に欠けている要素は、適切なデータがないことです。我々が見つけた複数ラウンド・複数画像の唯一のデータソースはSparklesDialogueデータセットで、そこにはわずか6520サンプルしか含まれていません。この制限に対処するため、既存の単一画像または単一ラウンドのデータから、複数ラウンド・複数画像のデータを合成するために、単純なデータ連結とLLaVA-Otterデータ混合という2つの方法を採用しました。 + +## 4.1 単純なデータ連結 +LLaVAモデルで利用する "llava" と "llava_dial" データセットでは、各サンプルは1つの画像に対する単一/複数ラウンドの会話で構成されています。ユーザーが複数の画像について逐次質問するシナリオをシミュレートするため、これら2つのデータセットに対して、簡単なデータ後処理を行いました。具体的には、ランダムな数のサンプルを1つのサンプルとして連結しました。 "llava" の場合は1~3個のサンプルを連結し、"llava_dial" の場合は1~2個のサンプルを連結しました。 + +## 4.2 LLaVAとOtterのデータブレンディング + +LLaVAモデルで使用されているllavaとllava_dialデータセット、およびOtterモデルで使用されているotter_mimicit_cgdデータセットは、すべてCOCO train2017画像を使用しています。llavaデータセットとllava_dialデータセットには、各サンプルに1つの画像に対する単発/複数回の会話が含まれます。otter_mimicit_cgdデータセットでは、各サンプルは画像のペアに対する1ラウンドの会話を含んでいます。そこで、otter_mimicit_cgdデータセットの各サンプルについて、同じ画像を使うllavaとllava_dialのサンプルを探し、「llava/llava_dial会話 -> otter_mimicit_cgd会話」という流れで新しいサンプルを構築しました。 + +
+ Friends + + *図5: LLaVA-Otterデータブレンド後のデータサンプル。灰色のダイアログボックスはLLaVAデータセットから、オレンジ色のダイアログボックスはOtterデータセットからのもの* +
+ +# 5. デモ +いくつかのオープンソースデータセットで2Bビジュアルエンコーダーと13B LLaMAモデルを使い、DeepSpeed-VisualChat-13Bモデルを訓練しました。DeepSpeed-VisualChat-13Bは、画像キャプション機能(*図6-8*)、計数とテキスト読み取り(*図6*)、著名人の認識(*図7*)、ストーリーテリング(*図8*)などを示しています。 + +
+ Friends + + *図6: DeepSpeed-VisualChatは、画像内の人数を数え、最初の画像のテキストを読み取ることができます。また、複数画像を横断的に理解することも可能です。* +
+ + +
+ CEO + + *図7: DeepSpeed-VisualChatは有名人を認識し、その人物の業績と関連付けることができます* +
+ + +
+ Zootopia + + *図8: DeepSpeed-VisualChatは、ストーリーを作ったり、映画を認識したりできます。* +
+ + +# 6. DeepSpeed-VisualChatを使い始めるには +DeepSpeed-VisualChatは使いやすく、かつ優れたスケーラビリティを持つ学習フレームワークで、これまでLLaMa-2-70Bモデルでテストされています。 +すべての実験で統一された命令チューニング形式を採用しており、そのテンプレートを以下に示します。 + +``` + % You are a powerful vision-language assistant. + +### Image 1: % some image, e.g., cat-1.png +### Question: % please describe the image. +### Answer: % It's a cute black cat. + +### Image 2: % some image, e.g., cat-2.png +### Image 3: % some image, e.g., cat-3.png +### Question: % What's the difference between the three cats? +### Answer: % The colors of the three cats are different. +... +``` + +DeepSpeed-VisualChatの訓練は簡単かつ便利に実行できます。ここではCLIPビジュアルエンコーダーとLLaMa-7Bモデルを使用する例を示します: + +``` +git clone https://github.com/microsoft/DeepSpeedExamples.git +cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ +pip install -r requirements.txt +cd training +bash training_scripts/run_7b.sh +``` + +訓練されたチェックポイントは自動的にHugging Faceと互換性のある形式で保存され、独自のビジュアルチャットAPIを提供するために使用できます: + +``` +cd ../chat +bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt path +``` + +より大規模なモデル推論をサポートするために、我々はHugging Faceの大規模モデル推論をDeepSpeed-VisualChat APIに組み込みました。そのため、ユーザーはGPUメモリ容量とモデルサイズに基づいて、異なるGPU数を選択することができます。 + +詳細は[ランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。 + +# 7. 早速使ってみましょう! + +DeepSpeed-VisualChatがオープンソース化され、AIコミュニティで利用できるようになったことを大変嬉しく思います。 + +* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) + +* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。 + +DeepSpeed-VisualChatは、さまざまなDeep Learningシステムやモデリング技術を含む、より大きなDeepSpeedエコシステムの一部です。詳細については、以下をご覧ください。 + +* 私たちの[ウェブサイト](https://www.deepspeed.ai/)で、詳細なブログ記事、チュートリアル、役立つドキュメントを提供しています。 +* DeepSpeedの最新ニュースは、[English X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP)、[Chinese Zhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。 + +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 + +* 私たちの[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください! diff --git a/blogs/deepspeed-visualchat/10-03-2023/README.md b/blogs/deepspeed-visualchat/10-03-2023/README.md index 06dcd851766c..eac9d06c3cea 100755 --- a/blogs/deepspeed-visualchat/10-03-2023/README.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README.md @@ -56,14 +56,14 @@ There are two common attention mechanisms used to connect the visual and textual
Different attention mehanisms -
-*Figure 2: Different Attention Mechanisms: Examine the differing attention mechanisms using an input sentence "User: Please describe the image." coupled with three Image tokens (I-token1, I-token2, I-token3). On the left, we demonstrate standard causal attention, treating image tokens as text. In the middle, we present cross attention applied to images, while maintaining standard causal attention for text tokens. On the right, we illustrate our innovative multi-modal attention proposal where image tokens only perform self-attention, and text tokens attend to text/image tokens independently, highlighted with an orange mask. This mechanism is defined by: softmax($`QK^T \odot M_1`$)+ softmax($`QK^T \odot M_2`$) with Q and K as query and key, $`M_1`$=[M==1], and $`M_2`$=[M==2], with M $`\in`$ R10x10 in this case.* + *Figure 2: Different Attention Mechanisms: Examine the differing attention mechanisms using an input sentence "User: Please describe the image." coupled with three Image tokens (I-token1, I-token2, I-token3). On the left, we demonstrate standard causal attention, treating image tokens as text. In the middle, we present cross attention applied to images, while maintaining standard causal attention for text tokens. On the right, we illustrate our innovative multi-modal attention proposal where image tokens only perform self-attention, and text tokens attend to text/image tokens independently, highlighted with an orange mask. This mechanism is defined by: softmax($`QK^T \odot M_1`$)+ softmax($`QK^T \odot M_2`$) with Q and K as query and key, $`M_1`$=[M==1], and $`M_2`$=[M==2], with M $`\in`$ R10x10 in this case.* + Causal Attention (CA): The CA-based method simply projects visual features (i.e., the features from the output of the final visual encoder layer) into textual features and combines them with the normal textual features after the textual embedding layer to feed into LLMs. The benefit of CA is that it's a natural extension of the original attention mechanism in LLMs, and as such, it doesn't introduce any extra modules or parameters. However, this approach raises some intuitive problems: -* For a visual token, it attends to previous visual and textual tokens, even though visual tokens are already fully encoded in a bidirectional manner and do not need further attention from other visual tokens or the beginning of textual tokens. +* For a visual token, it attends to previous visual and textual tokens, even though visual tokens are already fully encoded in a bidirectional manner and do not need further attention to other visual tokens or previous textual tokens. * For a textual token, the model needs to learn how to distribute its attention weights between its previous textual and image tokens. Due to these issues, we found that the data efficiency of CA in LVLMs is often problematic. To address this, LLaVA and QWen-VL require visual-language pretraining to fully align visual features with textual features. Cross Attention (CrA): The alternative, cross attention (CrA), along with CA, exhibits better data efficiency but also comes with a few drawbacks: diff --git a/csrc/cpu/lion/fused_lion.cpp b/csrc/cpu/lion/fused_lion.cpp new file mode 100644 index 000000000000..708df7f0146a --- /dev/null +++ b/csrc/cpu/lion/fused_lion.cpp @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "cpu_lion.h" + +// C++ interface + +void multi_tensor_lion(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, /*gpmv*/ + const float lr, + const float beta1, + const float beta2, + const int step, + const int mode, + const float weight_decay) +{ + static bool initialized = false; + if (!initialized) { + create_lion_optimizer(0); + initialized = true; + } + for (int i = 0; i < tensor_lists[0].size(); i++) { + ds_lion_step(0, + step, + lr, + beta1, + beta2, + weight_decay, + tensor_lists[1][i], + tensor_lists[0][i], + tensor_lists[2][i]); + } +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("multi_tensor_lion", + &multi_tensor_lion, + "Compute and apply gradient update to parameters for Lion optimizer"); +} diff --git a/csrc/includes/cpu_lion.h b/csrc/includes/cpu_lion.h new file mode 100644 index 000000000000..76034ceb3459 --- /dev/null +++ b/csrc/includes/cpu_lion.h @@ -0,0 +1,233 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#define NOMINMAX // Windows idiosyncrasy + // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c + +#include +#include +#include +#include "simd.h" + +#if defined(__ENABLE_CUDA__) +#include +#include +#include "cuda.h" +#include "custom_cuda_layers.h" +typedef __half ds_half_precision_t; +#else +#include +typedef unsigned short ds_half_precision_t; +#endif + +#define STEP(SPAN) \ + void Step_##SPAN(float* _params, \ + float* grads, \ + float* _exp_avg, \ + size_t _param_size, \ + ds_half_precision_t* dev_param = nullptr, \ + bool half_precision = false); + +class Lion_Optimizer { +public: + Lion_Optimizer(float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float weight_decay = 0) + : _alpha(alpha), _betta1(betta1), _betta2(betta2), _weight_decay(weight_decay), _step(0) + { +#if defined(__ENABLE_CUDA__) + cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); + cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); + + _streams[0] = TrainingContext::Instance().GetCurrentStream(); + _streams[1] = TrainingContext::Instance().GetNewStream(); + _buf_index = false; +#endif + } + ~Lion_Optimizer() + { +#if defined(__ENABLE_CUDA__) + cudaFreeHost(_doubled_buffer[0]); + cudaFreeHost(_doubled_buffer[1]); +#endif + } + +#if defined(__AVX512__) or defined(__AVX256__) + template + void Step_AVX(size_t* rounded_size, + float* _params, + float* grads, + float* _exp_avg, + size_t param_size, + ds_half_precision_t* dev_param = nullptr, + bool half_precision = false); +#endif + STEP(1) + STEP(4) + STEP(8) +#if defined(__ENABLE_CUDA__) + inline void SynchronizeStreams() + { + for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); + } +#endif + inline void IncrementStep(size_t step, float beta1, float beta2) + { + _step++; + if (_step != step || beta1 != _betta1 || beta2 != _betta2) { + _step = step; + _betta1 = beta1; + _betta2 = beta2; + } + } + inline void update_state(float lr, float weight_decay) + { + _alpha = lr; + _weight_decay = weight_decay; + } + +private: + float _alpha; + float _betta1; + float _betta2; + float _weight_decay; + size_t _step; + +#if defined(__ENABLE_CUDA__) + float* _doubled_buffer[2]; + cudaStream_t _streams[2]; + bool _buf_index; +#endif +}; + +#if defined(__AVX512__) or defined(__AVX256__) +template +void Lion_Optimizer::Step_AVX(size_t* rounded_size, + float* _params, + float* grads, + float* 
_exp_avg, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t new_rounded_size = 0; + int rshft = half_precision ? 1 : 0; + + constexpr float neg1 = -1.0f; + AVX_Data neg1_4; + neg1_4.data = SIMD_SET(neg1); + + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); + + float step_size = -_alpha; + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + float after_decay = 1.0f - _alpha * _weight_decay; + AVX_Data after_decay_4; + if (_weight_decay > 0) after_decay_4.data = SIMD_SET(after_decay); + + new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); + for (size_t t = 0; t < new_rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; + size_t offset = copy_size + t; +#if defined(__ENABLE_CUDA__) + if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } +#endif +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { + AVX_Data grad_4[span]; + simd_load(grad_4, grads + (i >> rshft), half_precision); + + AVX_Data momentum_4[span]; + simd_load(momentum_4, _exp_avg + i, false); + + AVX_Data param_4[span]; + simd_load(param_4, _params + (i >> rshft), half_precision); + + AVX_Data tmp_4[span]; + + simd_mul(tmp_4, momentum_4, betta1_4); + simd_fma(tmp_4, grad_4, betta1_minus1_4, tmp_4); + // We already used intrinsics, so consider the machine representation fixed. + simd_and(tmp_4, tmp_4, neg1_4); + simd_xor(tmp_4, tmp_4, step_size_4); + if (_weight_decay > 0) { + simd_fma(param_4, param_4, after_decay_4, tmp_4); + } else { + simd_add(param_4, param_4, tmp_4); + } + + simd_mul(momentum_4, momentum_4, betta2_4); + simd_fma(momentum_4, grad_4, betta2_minus1_4, momentum_4); + + simd_store(_params + (i >> rshft), param_4, half_precision); +#if defined(__ENABLE_CUDA__) + if (dev_params) { + simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); + } +#endif + simd_store(_exp_avg + i, momentum_4, false); + } +#if defined(__ENABLE_CUDA__) + if (dev_params) { + if (half_precision) + launch_param_update_half( + _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); + else + launch_param_update( + _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); + + _buf_index = !_buf_index; + } +#endif + } + *rounded_size = new_rounded_size; +} +#endif + +int create_lion_optimizer(int optimizer_id, + float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float weight_decay = 0, + bool should_log = false); + +int ds_lion_step(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float weight_decay, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg); + +int ds_lion_step_plus_copy(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float weight_decay, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& gpu_params); + +int destroy_lion_optimizer(int optimizer_id); diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index 712dd5b32e96..f77568be7835 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -24,6 +24,10 @@ #define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, 
y, c) #define SIMD_SQRT(x) _mm512_sqrt_ps(x) #define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_AND(x, y) _mm512_and_ps(x, y) +#define SIMD_ANDNOT(x, y) _mm512_andnot_ps(x, y) +#define SIMD_OR(x, y) _mm512_or_ps(x, y) +#define SIMD_XOR(x, y) _mm512_xor_ps(x, y) #define SIMD_WIDTH 16 #define SIMD_LOAD2(x, h) \ @@ -42,10 +46,14 @@ #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) #define SIMD_SQRT(x) _mm256_sqrt_ps(x) #define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_AND(x, y) _mm256_and_ps(x, y) +#define SIMD_ANDNOT(x, y) _mm256_andnot_ps(x, y) +#define SIMD_OR(x, y) _mm256_or_ps(x, y) +#define SIMD_XOR(x, y) _mm256_xor_ps(x, y) #define SIMD_WIDTH 8 + #define SIMD_LOAD2(x, h) \ ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) - #define SIMD_STORE2(x, d, h) \ ((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ : _mm256_storeu_ps(x, d)) @@ -136,5 +144,55 @@ inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) #pragma unroll for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); } } +template +inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { + dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r[i].data); + } +} +template +inline void simd_or(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_or(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r[i].data); } +} #endif diff --git a/csrc/lion/cpu_lion.cpp b/csrc/lion/cpu_lion.cpp new file mode 100644 index 000000000000..a0562eac9c4a --- /dev/null +++ b/csrc/lion/cpu_lion.cpp @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "cpu_lion.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("lion_update", &ds_lion_step, "DeepSpeed CPU Lion update (C++)"); + m.def("lion_update_copy", + &ds_lion_step_plus_copy, + "DeepSpeed CPU Lion update and param copy (C++)"); + m.def("create_lion", &create_lion_optimizer, "DeepSpeed CPU Lion (C++)"); + m.def("destroy_lion", &destroy_lion_optimizer, "DeepSpeed CPU Lion destroy (C++)"); +} diff --git a/csrc/lion/cpu_lion_impl.cpp b/csrc/lion/cpu_lion_impl.cpp new file mode 100644 index 000000000000..5c24e23b4b20 --- /dev/null +++ b/csrc/lion/cpu_lion_impl.cpp @@ -0,0 +1,255 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include +#include +#include +#include +#include +#include "cpu_lion.h" + +#if defined(__ENABLE_CUDA__) +#include +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" +#include "custom_cuda_layers.h" +#endif + +static std::unordered_map> s_optimizers; + +// C++ interface + +void Lion_Optimizer::Step_1(float* _params, + float* grads, + float* _exp_avg, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t rounded_size = 0; +#if defined(__AVX512__) or defined(__AVX256__) + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); +#endif + if (_param_size > rounded_size) { + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + + float alpha = _alpha; + float after_decay = 1 - alpha * _weight_decay; + ds_half_precision_t* grads_cast_h; + ds_half_precision_t* params_cast_h; + if (half_precision) { + grads_cast_h = reinterpret_cast(grads); + params_cast_h = reinterpret_cast(_params); + } + + for (size_t t = rounded_size; t < _param_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > _param_size) copy_size = _param_size - t; + size_t offset = copy_size + t; +#if defined(__ENABLE_CUDA__) + if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } +#endif +#pragma omp parallel for + for (size_t k = t; k < offset; k++) { + float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; + float param = half_precision ? (float)params_cast_h[k] : _params[k]; + float momentum = _exp_avg[k]; + float tmp = momentum * _betta1; + tmp = grad * betta1_minus1 + tmp; + // Rely on portable C++ methods to manipulate the sign bit of a floating-point + // number. 
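+                // At this point tmp holds the blend c = betta1 * momentum + (1 - betta1) * grad;
+                // the next statement replaces it with the Lion update step -alpha * sign(c)
+                // (copysignf returns |alpha| carrying the sign of c, and the leading minus negates it).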
+ tmp = -std::copysignf(alpha, tmp); + if (_weight_decay > 0) { + param = param * after_decay + tmp; + } else { + param = param + tmp; + } + momentum = momentum * _betta2; + momentum = grad * betta2_minus1 + momentum; +#if defined(__ENABLE_CUDA__) + if (dev_params) _doubled_buffer[_buf_index][k - t] = param; +#endif + if (half_precision) + params_cast_h[k] = (ds_half_precision_t)param; + else + _params[k] = param; + _exp_avg[k] = momentum; + } +#if defined(__ENABLE_CUDA__) + if (dev_params) { + launch_param_update( + _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); + + _buf_index = !_buf_index; + } +#endif + } + } +} + +void Lion_Optimizer::Step_4(float* _params, + float* grads, + float* _exp_avg, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t rounded_size = 0; +#if defined(__AVX512__) or defined(__AVX256__) + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); +#endif + if (_param_size > rounded_size) + Step_1((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), + half_precision); +} + +int create_lion_optimizer(int optimizer_id, + float alpha, + float betta1, + float betta2, + float weight_decay, + bool should_log) +{ + auto opt = std::make_shared(alpha, betta1, betta2, weight_decay); + + s_optimizers[optimizer_id] = opt; + + if (should_log) { + std::string avx_type = ""; +#if defined(__AVX512__) + avx_type = "AVX512"; +#else +#if defined(__AVX256__) + avx_type = "AVX2"; +#else + avx_type = "scalar"; +#endif +#endif + + printf("Lion Optimizer #%d is created with %s arithmetic capability.\n", + optimizer_id, + avx_type.c_str()); + printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f\n", + alpha, + betta1, + betta2, + weight_decay); + } + + return 0; +} + +void Lion_Optimizer::Step_8(float* _params, + float* grads, + float* _exp_avg, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t rounded_size = 0; +#if defined(__AVX512__) or defined(__AVX256__) + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); +#endif + if (_param_size > rounded_size) + Step_4((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), + half_precision); +} + +int ds_lion_step(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float weight_decay, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg) +{ + auto params_c = params.contiguous(); + auto grads_c = grads.contiguous(); + auto exp_avg_c = exp_avg.contiguous(); + + // assert(params.options().dtype() == grads.options().dtype()); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + opt->IncrementStep(step, beta1, beta2); + opt->update_state(lr, weight_decay); + + opt->Step_8(params_ptr, + grads_ptr, + exp_avg_ptr, + params_c.numel(), + nullptr, + (params.options().dtype() == at::kHalf)); + +#if defined(__ENABLE_CUDA__) + opt->SynchronizeStreams(); +#endif + return 0; +} + +int ds_lion_step_plus_copy(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float weight_decay, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& gpu_params) +{ +#if defined(__ENABLE_CUDA__) + auto params_c = params.contiguous(); + auto gpu_params_c = gpu_params.contiguous(); + auto exp_avg_c = exp_avg.contiguous(); + auto grads_c = grads.contiguous(); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + opt->IncrementStep(step, beta1, beta2); + opt->update_state(lr, weight_decay); + opt->Step_8(params_ptr, + grads_ptr, + exp_avg_ptr, + params_c.numel(), + gpu_params_ptr, + (params.options().dtype() == at::kHalf)); + + opt->SynchronizeStreams(); +#else + assert(false); +#endif + return 0; +} + +int destroy_lion_optimizer(int optimizer_id) +{ + s_optimizers.erase(optimizer_id); + + return 0; +} diff --git a/csrc/lion/fused_lion_frontend.cpp b/csrc/lion/fused_lion_frontend.cpp new file mode 100644 index 000000000000..e523f97ca309 --- /dev/null +++ b/csrc/lion/fused_lion_frontend.cpp @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +void multi_tensor_lion_cuda(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, + const float lr, + const float beta1, + const float beta2, + const int step, + const float weight_decay); + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("multi_tensor_lion", + &multi_tensor_lion_cuda, + "Compute and apply gradient update to parameters for Lion optimizer"); +} diff --git a/csrc/lion/multi_tensor_apply.cuh b/csrc/lion/multi_tensor_apply.cuh new file mode 100644 index 000000000000..12f41cb49c6b --- /dev/null +++ b/csrc/lion/multi_tensor_apply.cuh @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +#include +#include +#include +#include "compat.h" + +#include + +// #include + +// This header is the one-stop shop for all your multi-tensor apply needs. 
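+// Tensor pointers and sizes are packed into a TensorListMetadata struct that is passed to the
+// kernel by value; each CUDA block then processes one chunk_size-element chunk of one tensor,
+// located through the block_to_tensor / block_to_chunk tables. Whenever the tensor or block
+// slots fill up, or the last chunk is queued, multi_tensor_apply launches a kernel over the
+// accumulated work and resets its bookkeeping.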
+ +// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) +constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; + +template +struct TensorListMetadata { + void* addresses[n][depth_to_max_tensors[n - 1]]; + int sizes[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. + int start_tensor_this_launch; +}; + +template +__global__ void multi_tensor_apply_kernel(int chunk_size, + volatile int* noop_flag, + T tl, + U callable, + ArgTypes... args) +{ + // Hand the chunk information to the user-supplied functor to process however it likes. + callable(chunk_size, noop_flag, tl, args...); +} + +template +void multi_tensor_apply(int block_size, + int chunk_size, + const at::Tensor& noop_flag, + const std::vector>& tensor_lists, + T callable, + ArgTypes... args) +{ + TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); + int len0 = tensor_lists[0].size(); + TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); + auto ref_device = tensor_lists[0][0].device(); + TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda"); + for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices + { + TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); + for (int t = 0; t < tensor_lists[l].size(); t++) { + // TODO: Print which tensor fails. + bool contiguous_memory = tensor_lists[l][t].is_contiguous(); +#ifdef VERSION_GE_1_5 + contiguous_memory = (contiguous_memory || + tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); +#endif + TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); + TORCH_CHECK(tensor_lists[l][t].device() == ref_device, + "A tensor was not on the same device as the first tensor"); + TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); + } + } + + int ntensors = tensor_lists[0].size(); + + TensorListMetadata tl; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); + auto stream = at::cuda::getCurrentCUDAStream(); + + tl.start_tensor_this_launch = 0; + int loc_block_info = 0; + int loc_tensor_info = 0; + for (int t = 0; t < ntensors; t++) { + tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) + tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + loc_tensor_info++; + + int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; + + for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { + // std::cout << chunks_this_tensor << std::endl; + tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tl.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks_this_tensor - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); + bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); + if (tensors_full || blocks_full || last_chunk) { + // using accscalar_t = acc_type; + multi_tensor_apply_kernel<<>>( + chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); + + AT_CUDA_CHECK(cudaGetLastError()); + + // Reset. The control flow possibilities here make my brain hurt. 
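+                // After a launch the block list always restarts from zero; if the launch also
+                // consumed the last chunk of the current tensor, the tensor list restarts too,
+                // otherwise the partially processed tensor is carried over into slot 0 so the
+                // next launch can continue with its remaining chunks.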
+ loc_block_info = 0; + if (chunk == chunks_this_tensor - 1) { + // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << + // std::endl; + loc_tensor_info = 0; + tl.start_tensor_this_launch = t + 1; + } else { + // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << + // std::endl; + tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) + tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; + loc_tensor_info = 1; + tl.start_tensor_this_launch = t; + } + } + } + } +} diff --git a/csrc/lion/multi_tensor_lion.cu b/csrc/lion/multi_tensor_lion.cu new file mode 100644 index 000000000000..f5fe6dfdd7d0 --- /dev/null +++ b/csrc/lion/multi_tensor_lion.cu @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +#include +#include +// Another possibility: +// #include + +#include + +#include "multi_tensor_apply.cuh" +#include "type_shim.h" + +#define BLOCK_SIZE 512 +#define ILP 4 + +using MATH_T = float; + +template +struct LionFunctor { + __device__ __forceinline__ void operator()(int chunk_size, + volatile int* noop_gmem, + TensorListMetadata<3>& tl, + const float beta1, + const float beta2, + const float lr, + const float decay) + { + // I'd like this kernel to propagate infs/nans. + // if(*noop_gmem == 1) + // return; + + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + + // potentially use to pass in list of scalar + // int tensor_num = tl.start_tensor_this_launch + tensor_loc; + + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* g = (T*)tl.addresses[0][tensor_loc]; + g += chunk_idx * chunk_size; + + T* p = (T*)tl.addresses[1][tensor_loc]; + p += chunk_idx * chunk_size; + + T* m = (T*)tl.addresses[2][tensor_loc]; + m += chunk_idx * chunk_size; + + n -= chunk_idx * chunk_size; + + MATH_T after_decay = 1.0f - lr * decay; + + // see note in multi_tensor_scale_kernel.cu + for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { + MATH_T r_g[ILP]; + MATH_T r_p[ILP]; + MATH_T r_m[ILP]; +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) { + r_g[ii] = g[i]; + r_p[ii] = p[i]; + r_m[ii] = m[i]; + } else { + r_g[ii] = MATH_T(0); + r_p[ii] = MATH_T(0); + r_m[ii] = MATH_T(0); + } + } +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + MATH_T c = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; + MATH_T update = c > 0 ? 
(-lr) : lr; + r_p[ii] = r_p[ii] * after_decay + update; + r_m[ii] = beta2 * r_m[ii] + (1 - beta2) * r_g[ii]; + } +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) { + p[i] = r_p[ii]; + m[i] = r_m[ii]; + } + } + } + } +}; + +void multi_tensor_lion_cuda(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, + const float lr, + const float beta1, + const float beta2, + const int step, + const float weight_decay) +{ + using namespace at; + + // Assume single type across p,g,m1,m2 now + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "lion", + multi_tensor_apply<3>(BLOCK_SIZE, + chunk_size, + noop_flag, + tensor_lists, + LionFunctor(), + beta1, + beta2, + lr, + weight_decay);) + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index f7cbc5ed1c15..531f54d2628f 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -17,23 +17,25 @@ DS_COMM_REDUCE_OFF = False -def is_torch_two(): - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - if TORCH_MAJOR >= 2: +def is_torch_ver_eq_2_0(): + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) + if TORCH_MAJOR == 2 and TORCH_MINOR == 0: return True - else: - return False + return False -def torch_ver_ge_1_13(): - if is_torch_two(): +def is_torch_ver_ge_2_1(): + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) + if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: return True - else: - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - assert TORCH_MAJOR == 1 + return False - TORCH_MINOR = int(torch.__version__.split('.')[1]) - return TORCH_MINOR >= 13 + +def torch_ver_ge_1_13(): + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) + if TORCH_MAJOR >= 1 and TORCH_MINOR >= 13: + return True + return False def has_coalescing_manager(): @@ -45,9 +47,11 @@ def has_all_reduce_coalesced(): return hasattr(torch.distributed, "all_reduce_coalesced") and torch_ver_ge_1_13() -def get_coalescing_manager(group, device, reqs): - if is_torch_two(): +def get_coalescing_manager(group, device, reqs, async_op): + if is_torch_ver_eq_2_0(): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs) + elif is_torch_ver_ge_2_1(): + return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, async_ops=async_op) else: return torch.distributed.distributed_c10d._coalescing_manager(group, reqs) @@ -234,7 +238,7 @@ def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_ async_op=async_op) elif has_coalescing_manager(): reqs = [] - with get_coalescing_manager(group, input_tensors[0].device, reqs): + with get_coalescing_manager(group, input_tensors[0].device, reqs, async_op): for output, input in zip(output_tensors, input_tensors): handle = torch.distributed.distributed_c10d.all_gather_into_tensor(output, input, diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index 12b1799e49f2..a24d36344cdc 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -276,13 +276,6 @@ def load_module_recursive(module, prefix='', level=0): level + 1) load_module_recursive(r_module) - embedding_weight = None - - for n, p in r_module.named_parameters(): - if "word_embeddings." in n or "embed_tokens." in n or "wte." 
in n: - embedding_weight = p - if embedding_weight is not None and r_module.lm_head.weight.is_meta: - r_module.lm_head.weight = embedding_weight for sd_ in sd: del sd_ diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index af3c4f1c885d..f9dd921b9ae1 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -296,6 +296,15 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): return new_module + def set_lm_head(module): + embedding_weight = None + for n, p in module.named_parameters(): + if "word_embeddings." in n or "embed_tokens." in n or "wte." in n: + embedding_weight = p + if embedding_weight is not None and hasattr(module, "lm_head") and hasattr( + module.lm_head, "weight") and module.lm_head.weight.is_meta: + module.lm_head.weight = embedding_weight + if checkpoint_dict is not None and not config.replace_with_kernel_inject: # AutoTP shard loading checkpoint = checkpoint_dict["checkpoints"] @@ -309,6 +318,7 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): checkpoint=checkpoint_file) pbar.update(1) gc.collect() + set_lm_head(replaced_module) else: replaced_module = replace_module(model=model, orig_class=orig_layer_impl, @@ -386,6 +396,7 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): container=container_g) sds = [None for _ in sds] gc.collect() + set_lm_head(replaced_module) print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec") if config.save_mp_checkpoint_path is not None: @@ -554,14 +565,6 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" replaced_module, _ = _replace_module(model, policy, state_dict=sd) - if checkpoint is not None: - embedding_weight = None - for n, p in replaced_module.named_parameters(): - if "word_embeddings." in n or "embed_tokens." in n or "wte." in n: - embedding_weight = p - if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( - replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: - replaced_module.lm_head.weight = embedding_weight return replaced_module diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py index b5a03c458a46..ba1c9c1fd9f0 100755 --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -6,6 +6,7 @@ from . import adam from . import adagrad from . import lamb +from . import lion #from ..git_version_info_installed import installed_ops as __installed_ops__ #if __installed_ops__['sparse_attn']: from . import sparse_attention diff --git a/deepspeed/ops/lion/__init__.py b/deepspeed/ops/lion/__init__.py new file mode 100755 index 000000000000..2f90e5ec2e80 --- /dev/null +++ b/deepspeed/ops/lion/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .cpu_lion import DeepSpeedCPULion +from .fused_lion import FusedLion diff --git a/deepspeed/ops/lion/cpu_lion.py b/deepspeed/ops/lion/cpu_lion.py new file mode 100755 index 000000000000..a91a00643873 --- /dev/null +++ b/deepspeed/ops/lion/cpu_lion.py @@ -0,0 +1,141 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from cpuinfo import get_cpu_info +from deepspeed.utils import logger +from deepspeed.utils.logging import should_log_le +from deepspeed.ops.op_builder import CPULionBuilder + + +class DeepSpeedCPULion(torch.optim.Optimizer): + optimizer_id = 0 + + def __init__(self, model_params, lr=1e-3, betas=(0.9, 0.999), weight_decay=0, fp32_optimizer_states=True): + """Fast vectorized implementation of Lion optimizer on CPU: + + See Symbolic Discovery of Optimization Algorithms (https://doi.org/10.48550/arXiv.2302.06675). + + .. note:: + We recommend using our `config + `_ + to allow :meth:`deepspeed.initialize` to build this optimizer + for you. + + + Arguments: + model_params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + full_precision_optimizer_states: creates momentum and variance in full precision regardless of + the precision of the parameters (default: True) + """ + + default_args = dict(lr=lr, betas=betas, weight_decay=weight_decay) + super(DeepSpeedCPULion, self).__init__(model_params, default_args) + + cpu_info = get_cpu_info() + self.cpu_vendor = cpu_info["vendor_id_raw"].lower() if "vendor_id_raw" in cpu_info else "unknown" + if "amd" in self.cpu_vendor: + for group_id, group in enumerate(self.param_groups): + for param_id, p in enumerate(group['params']): + if p.dtype == torch.half: + logger.warning("FP16 params for CPULion may not work on AMD CPUs") + break + else: + continue + break + + self.opt_id = DeepSpeedCPULion.optimizer_id + DeepSpeedCPULion.optimizer_id = DeepSpeedCPULion.optimizer_id + 1 + self.fp32_optimizer_states = fp32_optimizer_states + self.ds_opt_lion = CPULionBuilder().load() + + self.ds_opt_lion.create_lion(self.opt_id, lr, betas[0], betas[1], weight_decay, should_log_le("info")) + + def __del__(self): + # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize + # is used multiple times in the same process (notebook or pytest worker) + self.ds_opt_lion.destroy_lion(self.opt_id) + + def __setstate__(self, state): + super(DeepSpeedCPULion, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None, fp16_param_groups=None): + """Update the model parameters. + + .. note:: + This method will be called internally by ZeRO-Offload. DeepSpeed + users should still use ``engine.step()`` as shown in the + `Getting Started + `_ guide. + + Args: + closure (callable, optional): closure to compute the loss. + Defaults to ``None``. + fp16_param_groups: FP16 GPU parameters to update. Performing the + copy here reduces communication time. Defaults to ``None``. + + Returns: + loss: if ``closure`` is provided. Otherwise ``None``. 
+ """ + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + # intended device for step + device = torch.device('cpu') + + # converting the fp16 params to a group of parameter + if type(fp16_param_groups) is list: + if type(fp16_param_groups[0]) is not list: + fp16_param_groups = [fp16_param_groups] + elif fp16_param_groups is not None: + fp16_param_groups = [[fp16_param_groups]] + + for group_id, group in enumerate(self.param_groups): + for param_id, p in enumerate(group['params']): + + if p.grad is None: + continue + + assert p.device == device, f"CPULion param is on {p.device} and must be 'cpu', make " \ + "sure you enabled 'offload_optimizer': 'cpu' in your ZeRO config." + + state = self.state[p] + # State initialization + if len(state) == 0: + #print(f'group {group_id} param {param_id} = {p.numel()}') + state['step'] = 0 + + #use full precision by default unless self.fp32_optimizer_states is off + state_dtype = torch.float if self.fp32_optimizer_states else p.dtype + + # gradient momentums + state['exp_avg'] = torch.zeros_like(p.data, dtype=state_dtype, device=device) + #memory_format=torch.preserve_format) + # gradient variances + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=state_dtype, device=device) + #memory_format=torch.preserve_format) + + state['step'] += 1 + beta1, beta2 = group['betas'] + + if fp16_param_groups is not None: + self.ds_opt_lion.lion_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, + group['weight_decay'], p.data, p.grad.data, state['exp_avg'], + fp16_param_groups[group_id][param_id].data) + else: + self.ds_opt_lion.lion_update(self.opt_id, state['step'], group['lr'], beta1, beta2, + group['weight_decay'], p.data, p.grad.data, state['exp_avg']) + return loss diff --git a/deepspeed/ops/lion/fused_lion.py b/deepspeed/ops/lion/fused_lion.py new file mode 100644 index 000000000000..7332a7f96361 --- /dev/null +++ b/deepspeed/ops/lion/fused_lion.py @@ -0,0 +1,131 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +This file is modified from fused_adam.py +""" + +import torch +from .multi_tensor_apply import MultiTensorApply + +multi_tensor_applier = MultiTensorApply(2048 * 32) +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import FusedLionBuilder + + +class FusedLion(torch.optim.Optimizer): + """Implements Lion algorithm. + + Currently GPU-only. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + set_grad_none (bool, optional): whether set grad to None when zero_grad() + method is called. (default: True) + + .. 
_Symbolic Discovery of Optimization Algorithms: + https://doi.org/10.48550/arXiv.2302.06675 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), weight_decay=0., set_grad_none=True): + + defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay) + super(FusedLion, self).__init__(params, defaults) + self.set_grad_none = set_grad_none + + fused_lion_cuda = FusedLionBuilder().load() + # Skip buffer + self._dummy_overflow_buf = get_accelerator().IntTensor([0]) + self.multi_tensor_lion = fused_lion_cuda.multi_tensor_lion + + def zero_grad(self): + if self.set_grad_none: + for group in self.param_groups: + for p in group['params']: + p.grad = None + else: + super(FusedLion, self).zero_grad() + + def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None, grad_scaler=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + + The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes. + """ + if any(p is not None for p in [grads, output_params, scale, grad_norms]): + raise RuntimeError('FusedLion has been updated.') + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + if len(group['params']) == 0: + continue + beta1, beta2 = group['betas'] + + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' not in group: + group['step'] = 0 + + # create lists for multi-tensor apply + g_16, p_16, m_16 = [], [], [] + g_bf, p_bf, m_bf = [], [], [] + g_32, p_32, m_32 = [], [], [] + + for p in group['params']: + if p.grad is None: + continue + if p.grad.data.is_sparse: + raise NotImplementedError('FusedLion does not support sparse gradients') + + state = self.state[p] + # State initialization + if len(state) == 0: + # DeepSpeed ZeRO 3 processes each subgroup a time, so we need to keep tracking step count for each tensor separately. + # While this is not an issue for ZeRO 1 & 2, since they apply a single optimization step to the whole param group at the same time. + # In order to keep backward compatibility for the existing checkpoints, we use group['state'] to initialize state['step'] if it exists. 
+ state['step'] = group.get('step', 0) + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + + if p.dtype == torch.float16: + g_16.append(p.grad.data) + p_16.append(p.data) + m_16.append(state['exp_avg']) + elif p.dtype == torch.bfloat16: + g_bf.append(p.grad) + p_bf.append(p) + m_bf.append(state['exp_avg']) + elif p.dtype == torch.float32: + g_32.append(p.grad.data) + p_32.append(p.data) + m_32.append(state['exp_avg']) + else: + raise RuntimeError('FusedLion only support fp16, bf16 and fp32.') + + if len(g_16) > 0: + state['step'] += 1 + multi_tensor_applier(self.multi_tensor_lion, self._dummy_overflow_buf, [g_16, p_16, m_16], group['lr'], + beta1, beta2, state['step'], group['weight_decay']) + + if len(g_bf) > 0: + state['step'] += 1 + multi_tensor_applier(self.multi_tensor_lion, self._dummy_overflow_buf, [g_bf, p_bf, m_bf], group['lr'], + beta1, beta2, state['step'], group['weight_decay']) + + if len(g_32) > 0: + state['step'] += 1 + multi_tensor_applier(self.multi_tensor_lion, self._dummy_overflow_buf, [g_32, p_32, m_32], group['lr'], + beta1, beta2, state['step'], group['weight_decay']) + + return loss diff --git a/deepspeed/ops/lion/multi_tensor_apply.py b/deepspeed/ops/lion/multi_tensor_apply.py new file mode 100644 index 000000000000..0ba228505cef --- /dev/null +++ b/deepspeed/ops/lion/multi_tensor_apply.py @@ -0,0 +1,17 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Copyright NVIDIA/apex +This file is adapted from NVIDIA/apex, commit a109f85 +""" + + +class MultiTensorApply(object): + + def __init__(self, chunk_size): + self.chunk_size = chunk_size + + def __call__(self, op, noop_flag_buffer, tensor_lists, *args): + return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 77f1cd51b02a..c31b9671296f 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -77,9 +77,10 @@ MUADAM_OPTIMIZER = 'muadam' MUADAMW_OPTIMIZER = 'muadamw' MUSGD_OPTIMIZER = 'musgd' +LION_OPTIMIZER = 'lion' DEEPSPEED_OPTIMIZERS = [ ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, - ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, MUSGD_OPTIMIZER + ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, MUSGD_OPTIMIZER, LION_OPTIMIZER ] # extra optimizer parameters for adam/adamw diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 22d7c882eb1a..b4c8ef56c701 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -37,7 +37,8 @@ from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \ ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ - TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, MUSGD_OPTIMIZER + TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, \ + MUSGD_OPTIMIZER, LION_OPTIMIZER from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.constants import \ @@ -1296,6 +1297,13 @@ def _configure_basic_optimizer(self, model_parameters): optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16") + elif self.optimizer_name() == 
LION_OPTIMIZER: + if self.zero_use_cpu_optimizer(): + from deepspeed.ops.lion import DeepSpeedCPULion + optimizer = DeepSpeedCPULion(model_parameters, **optimizer_parameters) + else: + from deepspeed.ops.lion import FusedLion + optimizer = FusedLion(model_parameters, **optimizer_parameters) elif self.optimizer_name() == MUADAM_OPTIMIZER: try: from mup import MuAdam diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 4e8a0faa0ae0..0c7c9f7a1090 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -386,7 +386,7 @@ def train_batch(self, data_iter=None): # TODO: should return precisely what loss returned and allow others to be queried? return self.agg_train_loss - def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg'): + def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg', bcast_loss=True): """Evaluate the pipeline on a batch of data from ``data_iter``. The engine will evaluate ``self.train_batch_size()`` total samples collectively across all workers. @@ -449,7 +449,7 @@ def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_o if self.is_last_stage(): eval_output = self._reduce_outputs(self.fwd_outputs, reduce=reduce_output) - if compute_loss: + if compute_loss and (bcast_loss or self.monitor.enabled): eval_output = self._bcast_pipe_scalar(eval_output) if self.global_rank == 0 and self.monitor.enabled: diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index 0250796f793d..0bf1ca4a894d 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -12,6 +12,7 @@ from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad from deepspeed.ops.adam import FusedAdam +from deepspeed.ops.lion import DeepSpeedCPULion, FusedLion from deepspeed.utils.nvtx import instrument_w_nvtx from deepspeed.accelerator import get_accelerator @@ -37,7 +38,8 @@ class ZeRORuntimeException(Exception): ZERO_SUPPORTED_OPTIMIZERS = [ - torch.optim.Adam, torch.optim.AdamW, FusedAdam, DeepSpeedCPUAdam, torch.optim.Adagrad, DeepSpeedCPUAdagrad + torch.optim.Adam, torch.optim.AdamW, FusedAdam, DeepSpeedCPUAdam, torch.optim.Adagrad, DeepSpeedCPUAdagrad, + DeepSpeedCPULion, FusedLion ] # Add apex FusedAdam to supported list if apex is installed diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md b/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md new file mode 100644 index 000000000000..290b8b4b8ba4 --- /dev/null +++ b/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed-VisualChat:多轮图像+文字,为你展现不一样的AI聊天魅力" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md +date: 2023-10-04 00:00:00 +tags: training Chinese +--- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md b/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md new file mode 100644 index 000000000000..f8b7e20cc2cf --- /dev/null +++ b/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed-VisualChat: 複数ラウンド・複数画像の入力が可能なAIチャット体験を実現" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md +date: 2023-10-04 00:00:00 +tags: training Japanese +--- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat.md b/docs/_posts/2023-10-04-deepspeed-visualchat.md 
new file mode 100644 index 000000000000..74a1eb66fd5c --- /dev/null +++ b/docs/_posts/2023-10-04-deepspeed-visualchat.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md +date: 2023-10-04 00:00:00 +tags: training English +--- diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index 0558433994ca..c2b4c04cad1c 100755 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -60,7 +60,9 @@ Available `DS_BUILD` options include: * `DS_BUILD_AIO` builds asynchronous (NVMe) I/O op * `DS_BUILD_CCL_COMM` builds the communication collective libs * `DS_BUILD_CPU_ADAM` builds the CPUAdam op +* `DS_BUILD_CPU_LION` builds the CPULion op * `DS_BUILD_FUSED_ADAM` builds the FusedAdam op (from [apex](https://github.com/NVIDIA/apex)) +* `DS_BUILD_FUSED_LION` builds the FusedLion op * `DS_BUILD_CPU_ADAGRAD` builds the CPUAdagrad op * `DS_BUILD_FUSED_LAMB` builds the FusedLamb op * `DS_BUILD_QUANTIZER` builds the quantizer op diff --git a/docs/index.md b/docs/index.md index 79fd6baae250..14c131a9a22d 100755 --- a/docs/index.md +++ b/docs/index.md @@ -7,7 +7,7 @@ title: "Latest News" --- DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). -* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) +* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)] * [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](/deepspeed4science/)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] * [2023/08] [DeepSpeed ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) * [2023/08] [DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md) diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 2db18e3629a1..da511a0a8c9d 100644 --- a/op_builder/async_io.py +++ 
b/op_builder/async_io.py @@ -5,6 +5,7 @@ import distutils.spawn import subprocess +import torch from .builder import OpBuilder @@ -35,11 +36,16 @@ def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O CPU_ARCH = self.cpu_arch() SIMD_WIDTH = self.simd_width() + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: + CPP_STD = '-std=c++17' + else: + CPP_STD = '-std=c++14' return [ '-g', '-Wall', '-O0', - '-std=c++14', + CPP_STD, '-shared', '-fPIC', '-Wno-reorder', diff --git a/op_builder/cpu_lion.py b/op_builder/cpu_lion.py new file mode 100644 index 000000000000..5c16d10ebb44 --- /dev/null +++ b/op_builder/cpu_lion.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +from .builder import TorchCPUOpBuilder + + +class CPULionBuilder(TorchCPUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_LION" + NAME = "cpu_lion" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.lion.{self.NAME}_op' + + def sources(self): + if self.build_for_cpu: + return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp'] + + return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp', 'csrc/common/custom_cuda_kernel.cu'] + + def libraries_args(self): + args = super().libraries_args() + if self.build_for_cpu: + return args + + if not self.is_rocm_pytorch(): + args += ['curand'] + + return args + + def include_paths(self): + import torch + if self.build_for_cpu: + CUDA_INCLUDE = [] + elif not self.is_rocm_pytorch(): + CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] + else: + CUDA_INCLUDE = [ + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), + ] + return ['csrc/includes'] + CUDA_INCLUDE diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py new file mode 100644 index 000000000000..b900a8f2369d --- /dev/null +++ b/op_builder/fused_lion.py @@ -0,0 +1,37 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CUDAOpBuilder + +import sys + + +class FusedLionBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_LION" + NAME = "fused_lion" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.lion.{self.NAME}_op' + + def sources(self): + return ['csrc/lion/fused_lion_frontend.cpp', 'csrc/lion/multi_tensor_lion.cu'] + + def include_paths(self): + return ['csrc/includes', 'csrc/lion'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() + + def nvcc_args(self): + nvcc_flags = ['-O3'] + self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) + return nvcc_flags diff --git a/tests/unit/checkpoint/test_mics_optimizer.py b/tests/unit/checkpoint/test_mics_optimizer.py index 5d0bff3967c4..3f853cd5c13a 100644 --- a/tests/unit/checkpoint/test_mics_optimizer.py +++ b/tests/unit/checkpoint/test_mics_optimizer.py @@ -8,13 +8,16 @@ import deepspeed +from deepspeed.runtime.utils import required_torch_version from unit.common import DistributedTest from unit.simple_model import * - from unit.checkpoint.common import * import pytest +if not required_torch_version(max_version=2.0): + pytest.skip("Skipping until we resolve problems with torch 2.1", allow_module_level=True) + class TestMiCSCheckpoint(DistributedTest): world_size = 4 diff --git a/tests/unit/ops/lion/test_cpu_lion.py b/tests/unit/ops/lion/test_cpu_lion.py new file mode 100644 index 000000000000..61a069af3257 --- /dev/null +++ b/tests/unit/ops/lion/test_cpu_lion.py @@ -0,0 +1,96 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import numpy as np +import pytest +from cpuinfo import get_cpu_info + +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.lion import FusedLion +from deepspeed.ops.op_builder import CPULionBuilder +from unit.common import DistributedTest + +if not deepspeed.ops.__compatible_ops__[CPULionBuilder.NAME]: + pytest.skip("cpu-lion is not compatible", allow_module_level=True) + +pytest.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower() + + +def check_equal(first, second, atol=1e-2, verbose=False): + x = first.detach().numpy() + y = second.detach().numpy() + print("ATOL", atol) + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) + + +def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): + for i in range(10): + param1.grad = torch.randn(model_size, device=param1.device).to(param1.dtype) + param2.grad = param1.grad.clone().detach().to(device=param2.device, dtype=param2.dtype) + + optimizer1.step() + optimizer2.step() + + tolerance = param1.float().norm().detach().numpy() * 1e-2 + check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) + + +@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('model_size', + [ + (64), + (22), + #(55), + (128), + (1024), + (1048576), + ]) # yapf: disable +class TestCPULion(DistributedTest): + world_size = 1 + reuse_dist_env = True + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") + def test_fused_lion_equal(self, dtype, model_size): + if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): + pytest.skip("cpu-lion with half precision not supported on AMD CPUs") + + from deepspeed.ops.lion import DeepSpeedCPULion + + cpu_data = torch.randn(model_size, device='cpu').to(dtype) + cpu_param = torch.nn.Parameter(cpu_data) + cuda_param = torch.nn.Parameter(cpu_data.to(get_accelerator().device_name())) + + cpu_optimizer = DeepSpeedCPULion([cpu_param]) + cuda_optimizer = FusedLion([cuda_param]) + + _compare_optimizers(model_size=model_size, + param1=cpu_param, + optimizer1=cpu_optimizer, + param2=cuda_param, + optimizer2=cuda_optimizer) + + +class TestCPULionGPUError(DistributedTest): + + def test_cpu_lion_gpu_error(self): + model_size = 64 + from deepspeed.ops.lion import DeepSpeedCPULion + device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' + param = torch.nn.Parameter(torch.randn(model_size, device=device)) + optimizer = DeepSpeedCPULion([param]) + + param.grad = torch.randn(model_size, device=device) + with pytest.raises(AssertionError): + optimizer.step() diff --git a/tests/unit/ops/lion/test_lion.py b/tests/unit/ops/lion/test_lion.py new file mode 100644 index 000000000000..b2c3ac2f52df --- /dev/null +++ b/tests/unit/ops/lion/test_lion.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import deepspeed +import torch +import pytest + +from deepspeed.ops.lion import FusedLion +from deepspeed.ops.lion import DeepSpeedCPULion +from unit.common import DistributedTest +from unit.simple_model import SimpleModel +from deepspeed.accelerator import get_accelerator + +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) +# yapf: disable +#'optimizer, zero_offload, resulting_optimizer +lion_configs = [["Lion", False, FusedLion], + ["Lion", True, DeepSpeedCPULion]] + +@pytest.mark.parametrize( + 'optimizer, zero_offload, resulting_optimizer', + lion_configs) +class TestLionConfigs(DistributedTest): + world_size = 1 + reuse_dist_env = True + + def test(self, + optimizer, + zero_offload, + resulting_optimizer): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": optimizer, + "params": { + "lr": 0.00015, + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 2, + "cpu_offload": zero_offload + } + } + model = SimpleModel(10) + model, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + # get base optimizer under zero + ds_optimizer = model.optimizer.optimizer + opt_class = resulting_optimizer + assert isinstance(ds_optimizer, opt_class) diff --git a/version.txt b/version.txt index 9b40aa6c214f..af88ba824866 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.10.4 +0.11.1
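A quick usage sketch of the Lion optimizer wired in above; this is not part of the patch itself. The "Lion" config type and the ZeRO settings mirror tests/unit/ops/lion/test_lion.py, while the model and hyperparameters below are illustrative assumptions. With "cpu_offload": True under ZeRO the engine builds DeepSpeedCPULion, otherwise FusedLion (see the _configure_basic_optimizer hunk in deepspeed/runtime/engine.py).

# Minimal sketch, assuming a single-GPU launch via the deepspeed runner;
# model size and batch size are placeholder values.
import torch
import deepspeed

ds_config = {
    "train_batch_size": 2,
    "optimizer": {
        "type": "Lion",  # maps to LION_OPTIMIZER in deepspeed/runtime/config.py
        "params": {"lr": 0.00015, "betas": [0.9, 0.999], "weight_decay": 0.0}
    },
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 2, "cpu_offload": False}
}

model = torch.nn.Linear(10, 10)  # stand-in model for illustration only
engine, optimizer, _, _ = deepspeed.initialize(config=ds_config,
                                               model=model,
                                               model_parameters=model.parameters())

The lion kernels JIT-compile on first use; alternatively, the new DS_BUILD_CPU_LION=1 / DS_BUILD_FUSED_LION=1 flags documented in docs/_tutorials/advanced-install.md prebuild them at install time.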