diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/cpu-torch-latest.yml
similarity index 98%
rename from .github/workflows/nv-torch-latest-cpu.yml
rename to .github/workflows/cpu-torch-latest.yml
index 60f9332f835d..ba4906db15c9 100644
--- a/.github/workflows/nv-torch-latest-cpu.yml
+++ b/.github/workflows/cpu-torch-latest.yml
@@ -1,4 +1,4 @@
-name: nv-torch-latest-cpu
+name: cpu-torch-latest
on:
workflow_dispatch:
diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index d7db447f5d26..960e0203919e 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -47,7 +47,8 @@ jobs:
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
- python -m pip install .[dev,1bit,autotuning]
+ python -m pip install pydantic==1.10.11
+ python -m pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
run: |
diff --git a/README.md b/README.md
index 7cce70fe105c..c8b30eb104c6 100755
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@
## Latest News
DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
+* [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
@@ -130,7 +131,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| ----------- | ------ |
| NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) |
| AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) |
-| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) |
+| CPU | [![cpu-torch-latest](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) |
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) |
@@ -262,6 +263,11 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
28. Shuaiwen Leon Song, Bonnie Kruft, Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Masahiro Tanaka, Xiaoxia Wu, Jeff Rasley, Ammar Ahmad Awan, Connor Holmes, Martin Cai, Adam Ghanem, Zhongzhu Zhou, Yuxiong He, et al. (2023) DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies [arXiv:2310.04610](https://arxiv.org/abs/2310.04610) [[blog]](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)
29. Zhewei Yao, Reza Yazdani Aminabadi, Stephen Youn, Xiaoxia Wu, Elton Zheng, Yuxiong He. (2023) ZeroQuant-HERO: Hardware-Enhanced Robust Optimized Post-Training Quantization Framework for W8A8 Transformers [arXiv:2310.17723](https://arxiv.org/abs/2310.17723)
+30. Xiaoxia Wu, Haojun Xia, Stephen Youn, Zhen Zheng, Shiyang Chen, Arash Bakhtiari, Michael Wyatt, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase, Leon Song, Zhewei Yao. (2023) ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks [arXiv:2312.08583](https://arxiv.org/abs/2312.08583)
+
+31. Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, Shuaiwen Leon Song. (2024) FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design [arXiv:2401.14112](https://arxiv.org/abs/2401.14112)
+
+
# Videos
1. DeepSpeed KDD 2020 Tutorial
diff --git a/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md b/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md
new file mode 100644
index 000000000000..8273ff3a51a7
--- /dev/null
+++ b/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md
@@ -0,0 +1,143 @@
+
+
+# DeepSpeed-FP6:大型语言模型中以FP6为核心的强大推理服务
+
+
+
+
+
+
+
+
+
+
+要引用DeepSpeed-FP6,请引用以下两篇arxiv报告 - ZeroQuant(4+2) 和 FP6-LLM:
+
+```
+@article{wu2023zeroquant,
+ title={Zeroquant(4+2): Redefining llms quantization with a new fp6-centric strategy for diverse generative tasks},
+ author={Wu, Xiaoxia and Xia, Haojun and Youn, Stephen and Zheng, Zhen and Chen, Shiyang and Bakhtiari, Arash and Wyatt, Michael and Aminabadi, Reza Yazdani and He, Yuxiong and Ruwase, Olatunji and Song, Leon and others},
+ journal={arXiv preprint arXiv:2312.08583},
+ year={2023}
+}
+
+@article{xia2024fp6,
+ title={FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design},
+ author={Xia, Haojun and Zheng, Zhen and Wu, Xiaoxia and Chen, Shiyang and Yao, Zhewei and Youn, Stephen and Bakhtiari, Arash and Wyatt, Michael and Zhuang, Donglin and Zhou, Zhongzhu and others},
+ journal={arXiv preprint arXiv:2401.14112},
+ year={2024}
+}
+```
+
+
+# Table of Contents
+1. [为什么选择6位浮点(FP6)](#introduction)
+2. [FP6的系统支持](#system-fp6)
+3. [FP6的LLMs服务系统](#serving-llm)
+4. [如何开始](#how-to-start)
+5. [软件改进](#software-improvements)
+6. [致谢和贡献](#ac)
+
+# 1. 为什么选择6位浮点
+大型语言模型(LLMs)领域正处于迅猛发展之中,模型量化是提升推理服务性能的关键技术之一。 我们的研究旨在提高计算效率和存储空间,同时保持模型质量。
+
+**深入研究INT4的挑战** 在最近的研究成果 ZeroQuant(4+2)[1] 中, 我们探索了INT4量化技术(如GPTQ算法) 在大语言模型(LLMs)中的表现能力。虽然这些技术可以减小模型大小和参数存储量,但由于过拟合问题, 它们在更一般的许多任务中往往表现不佳,包括代码生成和摘要等更多生成任务。因此, 当前迫切需要新的方法来提高LLMs的效率和有效性。
+
+ **FP6的突破** 我们对不同量化方法的探索将我们带到了FP6精度标准。尽管FP6数据格式在当前AI硬件的高效支持中存在挑战(我们将在下一节中解决这一挑战),该格式在各种任务的性能和灵活性方面均表现出色。值得注意的是,使用FP6量化的模型,如StarCoder-15B,在代码生成方面达到了与FP16模型相当的结果,而较小的模型(如BART-406M)在摘要方面达到了标准FP16性能水平。为了提高FP6在当前主流AI硬件上的执行效率,我们提出了一种4+2新颖的FP6 GPU kernel方案。这一创新使FP6成为提高LLMs效率的有效途径。更多详细信息请参阅我们的研究论文 ZeroQuant(4+2)[1]。
+
+
+# 2. FP6的系统支持
+
+**开创性的全栈GPU KERNEL设计** FP6量化的一个挑战是缺乏针对这种不规则位宽的高效GPU KERNEL设计。在我们最近的研究中(FP6-LLM[2]),我们设计并实现了TC-FPx,第一个具有Tensor Core支持的用于FP6和各种量化位宽(6位、5位、3位等)的浮点权重的GPU系统设计方案,缓解了LLM推理期间的“内存墙”问题。TC-FPx打破了底层GPU硬件的限制,允许GPU支持涉及任意位宽模型权重的矩阵乘法计算。在TC-FPx中,Tensor Cores用于矩阵乘法的密集计算,而SIMT cores在运行时有效地用于权重反量化,将模型权重反量化为FP16类型,Tensor Core基于此进行计算。它具有以下关键创新:
+
+
+
+
+
+* 运行前比特层级的数据排布转换。用以解决权重具有不规则位宽时不友好的内存访问挑战,实现GPU内存的最优访问;
+
+* 运行时的高效SIMT计算。用以最小化权重反量化的运行时开销;
+
+* 全栈的高效流水线设计。其SIMT计算、Tensor Core计算和GPU内存访问进行高效调度,最大程度提升性能。
+
+
+
+平均而言,我们的FP6 kernel在NVIDIA A100 GPU上进行(因decoder的矩阵形状狭长而导致参数矩阵的访存成为瓶颈的)矩阵乘法时,处理速度比FP16 cuBLAS基准提高了2.1倍。值得注意的是,通过FP6量化实现的FP6内核使LLaMA-70b模型能够在单个A100 GPU上运行。这一显著成就使得其在batch小于32的LLM推理任务中,性能比FP16基准高出1.69到2.65倍。目前,TC-FPx内核仅支持NVIDIA Ampere GPU,并且仅在A100 GPU上进行了测试和验证。
+
+
+# 3. 使用FP6服务LLMs
+
+我们已成功将FP6量化内核[3]集成到DeepSpeed-FastGen中,实现了运行时的即时量化。这一增强功能允许通过DeepSpeed-FastGen中的统一配置选项来高效量化和部署大型语言模型。通过我们的接口,用户可以输入HuggingFace模型名称或本地checkpoint目录。输入后,我们的系统将启动指定模型的加载,对每个线性层实现FP6量化,并将量化的权重进行比特层级的数据排布转换。转换后的张量随后作为更新后的权重,而原始的FP16权重被丢弃以优化内存使用。在推理阶段,FP6内核将利用这些6位的权重进行计算。
+
+我们在两个A100 GPU-80G上评估了LLaMA-2-70b模型使用FP6量化的服务性能,实现了1.5倍的推理延迟减少和3.5倍的推理吞吐量增加,与FP16基线相比。FP6量化为模型推理提供了两个关键好处:它使大型语言模型(LLMs)能够在更少的GPU上部署——例如,LLaMA-70b在单个A100-80G GPU上就能以FP6形式运行,而FP16模型至少需要两个GPU。此外,它显著加快了小batch之下内存访问为瓶颈的线性层计算。此外,FP6量化减少了模型权重的GPU内存需求,允许同时服务更多查询,从而提高了服务吞吐量。
+
+我们的系统在处理长序列生成时表现出很高的效率。如图1所示,对于超过提示长度的生成长度,我们的系统展现出显著的性能优势。随着生成序列长度的延伸,FP6与FP16之间的性能差异加大。这一趋势主要归因于解码长度扩展时,推理过程变得越来越受内存访问瓶颈限制,有利于我们的权重量化的GPU kernel,相对于FP16实现更大的kernel速度提升。需要强调的是,较长解码场景中内存访问瓶颈增强的两个因素如下:
+
+首先,KV缓存的内存使用随序列长度增加而增加,减少了可容纳的batch大小并导致线性层的矩阵计算瓶颈变为参数的访存。
+
+其次,在DeepSpeed-FastGen的prefill-decoding-mixed-batch技术背景下,对于decoding较长的情况,用于和decoding进行mixed-batching的prefill切块会相对不足,这导致纯粹用于decoding的batch频率增加,进一步加剧了访存的瓶颈。
+
+
+
+
+
+
+图1:在DeepSpeed-MII中,使用128个请求和32个客户端,对LLaMA-2-70B模型在2xA100-80g上进行端到端服务性能测试。我们尝试了128、256和512之间不同数量的请求,发现加速效果相似。
+
+尽管FP6量化带来了显著的好处,但当前实现仍面临一些限制。值得注意的是,在GEMM因batch较大或有充足的GPU内存而使得瓶颈变为Tensor Core计算时,我们的仅限权重的量化kernel可能无法保持其性能优势,尤其是与厂商的优化库如cuBlas相比。然而,我们系统的低内存占用仍是一个关键优势。目前的支持限于非混合专家(Non-MoE)结构,我们正在努力将支持扩展到MoE结构。此外,当前系统仅与FP16输入模型兼容,因为当前实现的FP6 kernel仅支持处理FP16的激活。
+
+
+
+# 4. 如何开始
+
+DeepSpeed-FP6的量化和推理体验简单方便。这里我们以LLaMa-2-70B模型为例:
+```python
+import mii
+pipe = mii.pipeline("NousResearch/Llama-2-70b-hf", quantization_mode='wf6af16')
+response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
+print(response)
+```
+
+您需要安装以下内容
+
+```
+pip install deepspeed-mii
+pip install qtorch
+```
+
+要使用我们的DeepSpeed-FP6进行基准测试,请访问以下脚本:
+```bash
+https://github.com/microsoft/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh
+```
+
+也请访问[FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) 获取FP6的独立kernel。不要忘了给仓库加星标以表达您的支持!
+
+
+# 5. 软件改进
+
+
+我们的DeepSpeed-FP6目前仅支持线性GEMM。我们期待未来能够支持MoE GEMM。我们将继续根据您的反馈和支持改进DeepSpeed-FP6。DeepSpeed-FP6是更大DeepSpeed生态系统的一部分,包括一系列深度学习系统和建模技术。要了解更多,
+
+* 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。
+* 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。
+
+我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。
+
+* 如果你喜欢我们的工作,请在[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/), [DeepSpeed-MII GitHub](https://github.com/microsoft/DeepSpeed-MII/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)仓库“点赞”!
+
+
+# 6. 致谢和贡献
+我们感谢悉尼大学和罗格斯大学的合作。我们还感谢开源库 [aspuru-guzik-group/qtorch](https://github.com/aspuru-guzik-group/qtorch).
+
+贡献:
+Xiaoxia Wu\* $^1$, Zhen Zheng\* $^1$, Haojun Xia\* $^2$, Arash Bakhtiari $^1$, Michael Wyatt $^1$, Shiyang Chen $^3$, Stephen Youn $^1$, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase $^1$, Zhewei Yao, Leon Song $^1$ $^2$(项目负责人)
+
+\* 平等贡献 1: 微软 2: 悉尼大学 3: 罗格斯大学
+
+文献:
+
+[1] ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks. arXiv. https://arxiv.org/abs/2312.08583
+
+[2] FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design. arXiv. https://arxiv.org/abs/2401.14112
+
+[3] FP6-LLM kernel release. GitHub. https://github.com/usyd-fsalab/fp6_llm
diff --git a/blogs/deepspeed-fp6/03-05-2024/README.md b/blogs/deepspeed-fp6/03-05-2024/README.md
new file mode 100755
index 000000000000..dbd6b2d081aa
--- /dev/null
+++ b/blogs/deepspeed-fp6/03-05-2024/README.md
@@ -0,0 +1,147 @@
+
+
+# DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models
+
+
+
+
+
+
+
+
+
+
+To cite DeepSpeed-FP6, please cite the following two arxiv reports - ZeroQuant(4+2) and FP6-LLM:
+
+```
+@article{wu2023zeroquant,
+ title={Zeroquant(4+2): Redefining llms quantization with a new fp6-centric strategy for diverse generative tasks},
+ author={Wu, Xiaoxia and Xia, Haojun and Youn, Stephen and Zheng, Zhen and Chen, Shiyang and Bakhtiari, Arash and Wyatt, Michael and Aminabadi, Reza Yazdani and He, Yuxiong and Ruwase, Olatunji and Song, Leon and others},
+ journal={arXiv preprint arXiv:2312.08583},
+ year={2023}
+}
+
+@article{xia2024fp6,
+ title={FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design},
+ author={Xia, Haojun and Zheng, Zhen and Wu, Xiaoxia and Chen, Shiyang and Yao, Zhewei and Youn, Stephen and Bakhtiari, Arash and Wyatt, Michael and Zhuang, Donglin and Zhou, Zhongzhu and others},
+ journal={arXiv preprint arXiv:2401.14112},
+ year={2024}
+}
+```
+
+
+# Table of Contents
+1. [Why 6-bit Floating Point (FP6)](#introduction)
+2. [System Support for FP6](#system-fp6)
+3. [LLMs Serving with FP6](#serving-llm)
+4. [How to Start](#how-to-start)
+5. [Software Improvements](#software-improvements)
+6. [Acknowledgments and Contributions](#ac)
+
+# 1. Why 6-bit Floating Point (FP6)
+
+
+In the evolving landscape of Large Language Models (LLMs) like GPT, our research aims to boost computational efficiency and reduce storage requirements while preserving model quality. This focus brings us to tackle the complex challenges of 4-bit quantization, where optimizing performance, efficiency, and accuracy is crucial.
+
+**Exploring the Challenges of 4-bit Quantization** In our recent research findings -- ZeroQuant (4+2)[1], we explore the capabilities of INT4 quantization techniques (like the GPTQ algorithm) for serving Large Language Models (LLMs). While these techniques reduce memory and computational requirements, they often perform poorly on a broad array of tasks, including generative tasks such as code generation and summarization, due to overfitting issues. This highlights the urgent need for new quantization approaches that simultaneously improve both the efficiency and effectiveness of LLMs.
+
+**Breakthroughs with FP6 Precision** Our exploration of different quantization methods led us to the FP6 precision standard. Despite the challenges in integrating and accelerating FP6 with current AI hardware -- which we will address in the next section -- this format excels in performance and flexibility across various tasks. Notably, we observe that for generative tasks, FP6 quantization can match the performance of the half-precision (FP16) format. For example, with FP6 quantization, StarCoder-15B achieves comparable code generation results to the FP16 variant, while a smaller model, such as BART-406M, achieves comparable summarization performance to the standard FP16 equivalent. In order to preserve these quality gains while matching the system efficiency of INT4 quantization on AI hardware, we propose a novel 4+2 FP6 scheme. This innovation makes FP6 a promising direction for improving the efficiency of LLMs, marking a significant leap in AI technology advancement. For more details, please refer to our research paper -- ZeroQuant (4+2)[1].
+
+
+# 2. System Support for FP6
+
+**Pioneering Full-Stack GPU Kernel Design** A key challenge of FP6 quantization is the lack of efficient GPU kernel designs for this irregular, i.e., "non-power of 2", bit-width. In our recent research — FP6-LLM [2], we introduce TC-FPx, the first full-stack GPU system design scheme with unified Tensor Core support of floating point weights for FP6 and other irregular quantization bit-widths (6-bit, 5-bit, 3-bit, etc.). TC-FPx breaks the limitations of the underlying GPU hardware, allowing the GPU to support linear layer calculations on model weights of arbitrary bit width. By increasing the number of bit-width options for efficient quantization, TC-FPx significantly mitigates the "memory wall" challenges of LLM inference. In TC-FPx, Tensor Cores are utilized for intensive computation of matrix multiplications, while SIMT cores are effectively leveraged for weight dequantization, transforming the x-bit model weights to FP16 type during runtime before feeding them to Tensor Cores. It has the following key innovations:
+
+
+
+
+
+* *Ahead-of-time Bit-level Pre-packing*: resolve the challenge of unfriendly memory access for weights with irregular bit-width, and enable optimal GPU memory access.
+
+* *SIMT-Efficient GPU Runtime*: minimize the runtime overhead of weight de-quantization.
+
+* *The software pipeline of TC-FPx kernel*: efficiently utilize SIMT cores, Tensor Cores, and the GPU memory hierarchy for high performance.
+
+
+
+On average, the TC-FPx kernel demonstrates a 2.1-fold improvement in processing speed over the FP16 cuBLAS benchmark during memory-intensive General Matrix Multiply (GEMM) operations on NVIDIA A100 GPUs. Notably, the implementation of the FP6 kernel through FP6 quantization facilitates the operation of LLaMA-70b on a solitary A100 GPU. This remarkable feat results in a normalized inference throughput that is 1.69 to 2.65 times superior to the FP16 benchmark when conducting inference tasks with batch sizes under 32. Currently, the TC-FPx kernel only supports NVIDIA Ampere GPUs and has only been tested and verified on A100 GPUs.
+
+
+# 3. LLMs Serving with FP6
+
+We have successfully integrated the FP6 quantization kernel [3] into DeepSpeed-FastGen, facilitating on-the-fly, weight-only quantization. This enhancement permits the efficient quantization and deployment of large language models (LLMs) through a unified configuration option within DeepSpeed-FastGen. Detailed information regarding this feature will be provided in due course. Through our interface, users have the flexibility to load a model checkpoint from either the HuggingFace hub or a local directory. While loading the checkpoint, our system applies FP6 round-to-nearest quantization on each linear layer and transforms the quantized weights into 6-bit prepacked tensors. These tensors then serve as the model weights for inference, while the original FP16 weights are discarded to release memory. Throughout the inference stage, the FP6 kernels leverage the 6-bit prepacked weights, ensuring a seamless experience for users engaging with our platform.
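+
+To make the on-the-fly step concrete, the sketch below shows the basic idea of per-output-channel, round-to-nearest FP6 (e3m2) quantization. It is illustrative only: the helper `fake_quantize_fp6` is not part of DeepSpeed, and it simply uses the simulated floating-point cast from `qtorch` (the library acknowledged at the end of this post), while the production path in DeepSpeed-FastGen additionally bit-packs the result for the FP6 kernels.
+
+```python
+import torch
+from qtorch.quant import float_quantize
+
+
+def fake_quantize_fp6(weight_fp16: torch.Tensor):
+    """Round-to-nearest FP6 (e3m2) with one scale per output channel.
+
+    Returns the FP6-ranged weight (still stored as FP16) plus the scales that the
+    FP6 kernel multiplies back in after dequantization."""
+    w = weight_fp16.float()
+    absmax_fp6 = 28.0  # largest magnitude representable in e3m2
+    scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5) / absmax_fp6
+    w_fp6 = float_quantize(w / scale, exp=3, man=2, rounding="nearest")
+    return w_fp6.half(), scale.squeeze(1).half()
+```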
+
+We assessed the LLaMA-70b model's serving performance using FP6 quantization on two A100 GPUs-80G, and observed a *1.5x* reduction in inference latency and a *3.5x* increase in inference throughput compared to the FP16 baseline. FP6 quantization offers two key benefits for model inference: it enables the deployment of large language models (LLMs) on fewer GPUs — for instance, LLaMA-70b fits on a single A100-80G GPU with FP6, versus at least two GPUs required for the FP16 baseline. Additionally, it significantly accelerates linear layers in memory-bound scenarios, which are common in LLM inference. Moreover, FP6 quantization reduces the GPU memory requirements for model weights, allowing for more queries to be served simultaneously, and thus increasing serving throughput.
+
+Our system demonstrates exceptional efficiency in handling long generation sequences. As illustrated in Figure 1, for generation lengths surpassing the prompt length, our system exhibits a notable performance superiority. The disparity in performance between FP6 and the FP16 baseline widens with the extension of the generation sequence length. This trend is primarily attributed to the inference process becoming increasingly memory-constrained as the decoding length expands, favoring our weight-quantized GPU kernels by facilitating faster compute compared to the FP16 baseline. It is important to highlight two factors contributing to the increased memory constraints in longer decoding scenarios.
+ - Firstly, the memory usage for the KV cache escalates with the sequence length, reducing the feasible batch sizes and leading to memory-bound GEMM operations.
+ - Secondly, within the context of DeepSpeed-FastGen's prefill-decoding-mixed-batch technique, scenarios involving extended token generation encounter a reduction in prefill-chunks available for mixing with decodings. This results in a higher frequency of batches dedicated solely to decodings, further intensifying the memory-bound conditions.
+
+
+
+
+
+
+
+ *Figure 1*: End-to-end serving performance in DeepSpeed-MII with 32 clients and a total of 128 requests, for the LLaMA-2-70B model on 2xA100-80g with two-way tensor parallelism. We experimented with different numbers of requests (128, 256, and 512) and found that the speedup is similar.
+
+Despite the significant benefits of FP6 quantization, the current implementation faces limitations. Notably, in scenarios where GEMM operations become compute-bound due to large batch sizes or sufficient GPU memory, our weight-only quantization kernel may not sustain its latency advantage, especially against optimized libraries like cuBLAS. However, our system's memory efficiency remains a key benefit. Currently, support is limited to Non-Mixture of Experts (Non-MoE) structures, with efforts underway to extend support to MoE structures. Additionally, the system is compatible only with FP16 input models, as the FP6 kernel processes FP16 activations exclusively.
+
+
+
+# 4. How to Start with DeepSpeed-FP6
+
+The quantization-and-inference experience of DeepSpeed-FP6 is straightforward and convenient. Here we give an example based on the LLaMA-2-70B model:
+
+```python
+import mii
+pipe = mii.pipeline("NousResearch/Llama-2-70b-hf", quantization_mode='wf6af16')
+response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
+print(response)
+```
+
+You need to install the following:
+```
+pip install deepspeed-mii
+pip install qtorch
+```
+
+To benchmark with our DeepSpeed-FP6, please visit the following script:
+```bash
+https://github.com/microsoft/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh
+```
+
+Please also visit the [FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) for the standalone kernel of FP6. Don't forget to star the repo to show your support!
+
+
+# 5. Software Improvements
+
+
+Currently, DeepSpeed-FP6 supports only dense models, with support for MoE models coming soon. We will continue to improve DeepSpeed-FP6 with your feedback and support. DeepSpeed-FP6 is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
+
+* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/microsoft/DeepSpeed-MII/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work!
+
+
+# 6. Acknowledgments and Contributions
+We thank the University of Sydney and Rutgers University for their collaboration. We also thank the open-source library [aspuru-guzik-group/qtorch](https://github.com/aspuru-guzik-group/qtorch).
+
+Contributions:
+Xiaoxia Wu\* $^1$, Zhen Zheng\* $^1$, Haojun Xia\* $^2$, Arash Bakhtiari $^1$, Michael Wyatt $^1$, Shiyang Chen $^3$, Stephen Youn $^1$, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase $^1$, Zhewei Yao, Leon Song $^1$ $^2$ (project lead)
+
+\* Equal Contribution
+1: Microsoft
+2: University of Sydney
+3: Rutgers University
+
+Reference:
+
+[1] ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks. arXiv. https://arxiv.org/abs/2312.08583
+
+[2] FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design. arXiv. https://arxiv.org/abs/2401.14112
+
+[3] FP6-LLM kernel release. GitHub. https://github.com/usyd-fsalab/fp6_llm
diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png b/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png
new file mode 100644
index 000000000000..5024332a8f33
Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png differ
diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png b/blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png
new file mode 100644
index 000000000000..61a5061dc954
Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png differ
diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png
new file mode 100644
index 000000000000..c1095ee0053b
Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png differ
diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png
new file mode 100644
index 000000000000..aeeaab55466d
Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png differ
diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png
new file mode 100644
index 000000000000..eb3c1ac12a7b
Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png differ
diff --git a/deepspeed/inference/v2/config_v2.py b/deepspeed/inference/v2/config_v2.py
index 64e7e29b1844..85e4b7a0e0a0 100644
--- a/deepspeed/inference/v2/config_v2.py
+++ b/deepspeed/inference/v2/config_v2.py
@@ -3,8 +3,8 @@
# DeepSpeed Team
+from typing import Optional
from deepspeed.pydantic_v1 import Field
-
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from .ragged import DSStateManagerConfig
@@ -16,6 +16,16 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel):
""" Number of devices to split the model across using tensor parallelism. """
+class QuantizationConfig(DeepSpeedConfigModel):
+ """ Configure tensor parallelism settings """
+
+ quantization_mode: Optional[str] = None
+ """ The quantization mode in string format. The supported modes are as follows:
+ - 'wf6af16', weight-only quantization with FP6 weight and FP16 activation.
+ """
+ # TODO: may reuse the constants in deepspeed/compression/constants.py
+
+
class RaggedInferenceEngineConfig(DeepSpeedConfigModel):
""" Sets parameters for DeepSpeed Inference Engine. """
@@ -29,3 +39,5 @@ class RaggedInferenceEngineConfig(DeepSpeedConfigModel):
"""
Configuration for managing persistent state
"""
+
+ quantization: QuantizationConfig = {}
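+
+# Illustrative usage sketch (not part of this file's API): as with other nested
+# DeepSpeedConfigModel fields, the new section can be populated from a plain dict, e.g.
+#   RaggedInferenceEngineConfig(quantization={"quantization_mode": "wf6af16"})
+# which is where the `quantization_mode` argument exposed by DeepSpeed-MII is expected to land.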
diff --git a/deepspeed/inference/v2/engine_factory.py b/deepspeed/inference/v2/engine_factory.py
index 9281640f844a..c320108f55e5 100644
--- a/deepspeed/inference/v2/engine_factory.py
+++ b/deepspeed/inference/v2/engine_factory.py
@@ -21,6 +21,7 @@
FalconPolicy,
PhiPolicy,
QwenPolicy,
+ Qwen2Policy,
)
from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy
from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata
@@ -120,6 +121,8 @@ def build_hf_engine(path: str,
policy = PhiPolicy(model_config, checkpoint_engine=checkpoint_engine)
elif model_config.model_type == "qwen":
policy = QwenPolicy(model_config, checkpoint_engine=checkpoint_engine)
+ elif model_config.model_type == "qwen2":
+ policy = Qwen2Policy(model_config, checkpoint_engine=checkpoint_engine)
else:
raise ValueError(f"Unsupported model type {model_config.model_type}")
diff --git a/deepspeed/inference/v2/kernels/core_ops/__init__.py b/deepspeed/inference/v2/kernels/core_ops/__init__.py
index bbb53e5b58a2..1d16b484a560 100644
--- a/deepspeed/inference/v2/kernels/core_ops/__init__.py
+++ b/deepspeed/inference/v2/kernels/core_ops/__init__.py
@@ -8,3 +8,4 @@
from .cuda_layer_norm import *
from .cuda_rms_norm import *
from .gated_activations import *
+from .cuda_linear import *
diff --git a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp
index 58df88e56136..2397b0694696 100644
--- a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp
+++ b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp
@@ -8,6 +8,7 @@
#include "bias_activation.h"
#include "blas.h"
+#include "cuda_linear_kernels.h"
#include "gated_activation_kernels.h"
#include "layer_norm.h"
#include "rms_norm.h"
@@ -33,4 +34,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
// rms_norm.h
m.def("rms_norm", &rms_norm, "DeepSpeed rms norm in CUDA");
m.def("rms_pre_norm", &rms_pre_norm, "DeepSpeed rms pre norm in CUDA");
+
+ // cuda_linear_kernels.h
+ m.def("cuda_wf6af16_linear", &cuda_wf6af16_linear, "DeepSpeed Wf6Af16 linear in CUDA");
+ m.def(
+ "preprocess_weight", &preprocess_weight, "preprocess the FP16 weight to be 2bit and 4 bit");
}
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py
new file mode 100644
index 000000000000..cd08409c0a7a
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .cuda_linear import *
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py
new file mode 100644
index 000000000000..69aa9e8920e2
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py
@@ -0,0 +1,207 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+
+from ....inference_utils import DtypeEnum
+from ....logging import inference_logger
+from deepspeed.ops.op_builder import InferenceCoreBuilder
+from ... import DSKernelBase
+
+
+class CUDAWf6Af16Linear(DSKernelBase):
+ """
+ Wrapper around the CUDA kernel of Wf6Af16 quantized linear.
+
+ Performs z = x @ y
+ """
+ supported_dtypes = [DtypeEnum.fp16]
+
+ def __init__(self):
+ self.inf_module = InferenceCoreBuilder().load()
+ self.inf_module.create_handle()
+ self.kernel = self.inf_module.cuda_wf6af16_linear
+ # The split_k_map is profiled on A100-80G GPU for some common shapes.
+ # It is an array of dictionaries, where the array index is the tokens chunk id.
+ # The dictionary is the mapping from the output channel to the split-K size.
+ self.split_k_map = [
+ { # tokens: [1, 64]
+ 3072: 18,
+ 4096: 13,
+ 5120: 10,
+ 6144: 9,
+ 8192: 6,
+ 10240: 5,
+ 14336: 7,
+ 28672: 7,
+ 57344: 7
+ },
+ { # tokens: [65:128]
+ 3072: 9,
+ 4096: 6,
+ 5120: 5,
+ 6144: 9,
+ 8192: 3,
+ 10240: 5,
+ 14336: 7,
+ 28672: 7,
+ 57344: 6
+ },
+ { # tokens: [129:192]
+ 3072: 6,
+ 4096: 4,
+ 5120: 7,
+ 6144: 3,
+ 8192: 2,
+ 10240: 5,
+ 14336: 5,
+ 28672: 5,
+ 57344: 4
+ },
+ { # tokens: [193:256]
+ 3072: 9,
+ 4096: 3,
+ 5120: 5,
+ 6144: 2,
+ 8192: 5,
+ 10240: 4,
+ 14336: 8,
+ 28672: 6,
+ 57344: 4
+ },
+ { # tokens: [257:320]
+ 3072: 7,
+ 4096: 5,
+ 5120: 2,
+ 6144: 5,
+ 8192: 4,
+ 10240: 1,
+ 14336: 3,
+ 28672: 3,
+ 57344: 4
+ },
+ { # tokens: [321:384]
+ 3072: 3,
+ 4096: 2,
+ 5120: 5,
+ 6144: 3,
+ 8192: 1,
+ 10240: 8,
+ 14336: 3,
+ 28672: 4,
+ 57344: 3
+ },
+ { # tokens: [385:448]
+ 3072: 5,
+ 4096: 7,
+ 5120: 3,
+ 6144: 5,
+ 8192: 7,
+ 10240: 3,
+ 14336: 1,
+ 28672: 1,
+ 57344: 3
+ },
+ { # tokens: [449:512]
+ 3072: 2,
+ 4096: 5,
+ 5120: 4,
+ 6144: 1,
+ 8192: 5,
+ 10240: 2,
+ 14336: 6,
+ 28672: 4,
+ 57344: 1
+ },
+ { # tokens: [513:576]
+ 3072: 2,
+ 4096: 3,
+ 5120: 1,
+ 6144: 1,
+ 8192: 3,
+ 10240: 3,
+ 14336: 3,
+ 28672: 1,
+ 57344: 1
+ },
+ { # tokens: [577:640]
+ 3072: 5,
+ 4096: 4,
+ 5120: 1,
+ 6144: 4,
+ 8192: 2,
+ 10240: 1,
+ 14336: 1,
+ 28672: 1,
+ 57344: 1
+ },
+ { # tokens: [641:704]
+ 3072: 3,
+ 4096: 1,
+ 5120: 2,
+ 6144: 2,
+ 8192: 1,
+ 10240: 2,
+ 14336: 1,
+ 28672: 1,
+ 57344: 1
+ },
+ { # tokens: [705:768]
+ 3072: 3,
+ 4096: 1,
+ 5120: 3,
+ 6144: 2,
+ 8192: 1,
+ 10240: 1,
+ 14336: 1,
+ 28672: 1,
+ 57344: 1
+ }
+ ]
+
+ def __call__(self, output: torch.Tensor, hidden_states: torch.Tensor, weights_2bit: torch.Tensor,
+ weights_4bit: torch.Tensor, scale: torch.Tensor, out_channels, tokens, in_channels) -> torch.Tensor:
+ """
+ Matmul kernel of FP6 weight-only quantized linear. All inputs should be contiguous.
+ It does not support batched-matmul.
+
+ Parameters:
+            output (torch.Tensor): Output tensor. Shape is [token_number, out_features].
+            hidden_states (torch.Tensor): Input tensor. Shape is [token_number, in_features].
+            weights_2bit (torch.Tensor): Input tensor of the 2-bit slice. Shape is [out_features*2/8, in_features].
+            weights_4bit (torch.Tensor): Input tensor of the 4-bit slice. Shape is [out_features*4/8, in_features].
+            scale (torch.Tensor): Input tensor. Shape is [out_features], since the scale is per output channel.
+ out_channels (int): The number of output channels
+ tokens (int): The number of tokens
+ in_channels (int): The number of input channels
+ """
+
+ if out_channels % 256 != 0 or in_channels % 64 != 0:
+ raise ValueError("The out and in channel should be multiple of 256 and 64 respectively.")
+
+ # TODO: add a more general heuristic to determine the split-K.
+ split_k = -1 # not initialized
+ if tokens <= 768:
+ # Try to find the split-K from the pre-profiled map.
+ tokens_chunk_id = (tokens - 1) // 64
+ split_k = self.split_k_map[tokens_chunk_id].get(out_channels, -1)
+ if split_k == -1:
+ split_k = 1
+ inference_logger().warning(
+ f"The split-K setting may be suboptimal for shape {tokens}x{in_channels}x{out_channels}...")
+
+ workspace = self.get_workspace(out_channels, tokens, in_channels, split_k, torch.float, hidden_states.device)
+ self.kernel(output, hidden_states, weights_2bit, weights_4bit, scale, workspace, out_channels, tokens,
+ in_channels, split_k)
+
+ def get_workspace(self, out_channels: int, tokens: int, in_channels: int, split_k: int, dtype,
+ device) -> torch.Tensor:
+ """
+ Allocate workspace for the kernel. The workspace is used to store the intermediate results of the matmul before
+ split-K. The split-K size is determined by the size of the matmul.
+ """
+ workspace = torch.empty((split_k, out_channels, tokens), dtype=dtype, device=device)
+
+ return workspace
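+
+
+# Illustrative usage sketch (comments only; not executed anywhere in this file). It assumes
+# the InferenceCoreBuilder CUDA extension is built and that `preprocess_weight` (exposed by
+# core_ops.cpp) was used to split a fake-quantized FP16 weight into its 2-bit and 4-bit slices:
+#
+#   kernel = CUDAWf6Af16Linear()
+#   out_channels, in_channels, tokens = 4096, 4096, 128
+#   weight = torch.full((out_channels, in_channels), 0.5, dtype=torch.float16)
+#   # (real weights must already hold FP6-representable values; out-of-range values raise
+#   #  inside preprocess_weight)
+#   w2, w4 = kernel.inf_module.preprocess_weight(weight)  # CPU op, returns packed slices
+#   scale = torch.ones(out_channels, dtype=torch.float16, device="cuda")
+#   x = torch.randn(tokens, in_channels, dtype=torch.float16, device="cuda")
+#   y = torch.empty(tokens, out_channels, dtype=torch.float16, device="cuda")
+#   kernel(y, x, w2.cuda(), w4.cuda(), scale, out_channels, tokens, in_channels)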
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp
new file mode 100644
index 000000000000..677bec22ded8
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp
@@ -0,0 +1,224 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cuda_linear_kernels.h"
+
+namespace {
+
+// For bit-level debugging.
+template <typename T>
+void print_bits(T num)
+{
+ char bits[sizeof(T) * 8 + 1] = {'\0'};
+ for (int bit = 0; bit < (sizeof(T) * 8); bit++) {
+ bits[sizeof(T) * 8 - 1 - bit] = '0' + (num & 0x01);
+ num = num >> 1;
+ }
+ printf("%s\n", bits);
+}
+
+void print_bits(half num)
+{
+ char bits[sizeof(half) * 8 + 1] = {'\0'};
+    auto int_num = *reinterpret_cast<uint16_t*>(&num);
+ for (int bit = 0; bit < (sizeof(half) * 8); bit++) {
+ bits[sizeof(half) * 8 - 1 - bit] = '0' + (int_num & 0x01);
+ int_num = int_num >> 1;
+ }
+ printf("%s\n", bits);
+}
+
+/*
+ * Function to pack 4 fake quantized FP16 value into continuously stored 4 FP6 values.
+ */
+void cast_fp16_fp6(uint16_t* FP16x4, uint8_t* FP6x4)
+{
+ // Constants for FP6
+ constexpr int exponent_nbits_fp6 = 3;
+ constexpr int mantissa_nbits_fp6 = 2;
+ constexpr int exp_bias_fp6 = (1 << (exponent_nbits_fp6 - 1)) - 1;
+ // Constants for FP16
+ constexpr int exponent_nbits_fp16 = 5;
+ constexpr int mantissa_nbits_fp16 = 10;
+ constexpr int exp_bias_fp16 = (1 << (exponent_nbits_fp16 - 1)) - 1;
+
+ int fp6_temp[4];
+
+ float absmin_nonzero_fp6 = 0.0625;
+ // Note that we regard the exponent of '111' as a regular value rather than NaN or inf. This is
+ // the same with that in qtorch.
+ float absmax_fp6 = 28;
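+    // For reference, with 3 exponent bits (bias 3) and 2 mantissa bits:
+    //   largest normal magnitude      = 1.75 * 2^(7-3) = 28
+    //   smallest nonzero (subnormal)  = 0.25 * 2^(1-3) = 0.0625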
+
+ for (int i = 0; i < 4; ++i) {
+ uint16_t source = FP16x4[i];
+ float fp6_value_abs = std::abs(__half2float(*((half*)(&source))));
+ if ((fp6_value_abs != 0 && fp6_value_abs < absmin_nonzero_fp6) ||
+ fp6_value_abs > absmax_fp6) {
+ // TODO(zhen): a better way may be rounding it to the nearest FP6 value.
+ throw std::invalid_argument("Input value out of range for FP6.");
+ }
+
+ // It is not safe to do shift operation on uint16_t. So we promote it to int.
+ int source_promote = int(source);
+
+ int sign_bit = (source_promote >> 15);
+ // Extracting exponent represented in FP16. The sign mask 0x7FFF is '0111 1111 1111 1111'
+ int exp_bit = (source_promote & 0x7FFF) >> mantissa_nbits_fp16;
+ // Extracting mantissa represented in FP16
+ int mant_bit = source_promote & ((1 << mantissa_nbits_fp16) - 1);
+
+ int new_exp_bit;
+ int new_mant_bit;
+
+ if (exp_bit == 0) {
+ // Subnormal FP16 number. Too small for FP6.
+ new_exp_bit = 0;
+ new_mant_bit = 0;
+ } else {
+ new_mant_bit = mant_bit >> (mantissa_nbits_fp16 - mantissa_nbits_fp6);
+ new_exp_bit = exp_bit - exp_bias_fp16 + exp_bias_fp6;
+
+ // Deal with subnormal FP6 values.
+ int target_exp_val = exp_bit - exp_bias_fp16;
+ int min_fp6_exp_val = -exp_bias_fp6 + 1;
+ bool subnormal_fp6 = target_exp_val < min_fp6_exp_val;
+ if (subnormal_fp6) {
+ // TODO(zhen): add the rounding logic.
+ new_exp_bit = 0;
+ // The implicit 1 in the mantissa of FP16 is not present in subnormal FP6. Thus we
+ // need to add it
+ new_mant_bit = (new_mant_bit | (1 << mantissa_nbits_fp6)) >>
+ (min_fp6_exp_val - target_exp_val);
+ }
+ }
+
+ fp6_temp[i] = (sign_bit << (exponent_nbits_fp6 + mantissa_nbits_fp6)) |
+ (new_exp_bit << mantissa_nbits_fp6) | new_mant_bit;
+ }
+ // Pack the values
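+    // (bit layout: byte0 = v0[5:0]<<2 | v1[5:4]; byte1 = v1[3:0]<<4 | v2[5:2];
+    //  byte2 = v2[1:0]<<6 | v3[5:0], where v0..v3 are the four 6-bit codes)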
+ FP6x4[0] = fp6_temp[0] << 2 | (fp6_temp[1] >> 4);
+ FP6x4[1] = (fp6_temp[1] & 0x0F) << 4 | (fp6_temp[2] >> 2);
+ FP6x4[2] = (fp6_temp[2] & 0x03) << 6 | fp6_temp[3];
+}
+
+/*
+ * Function to prepack FP16 weights into continuous FP6 values.
+ *
+ * Parameters:
+ * weight_16bit: input weight in FP16, size M*K
+ * weight_6bit: output weight in packed FP6, continuously stored, size M*K*6/8
+ * M, K: the shape of the weight
+ */
+void weight_prepacking_fp16_to_fp6(uint16_t* weight_16bit,
+ uint8_t* weight_6bit_packed,
+ size_t M,
+ size_t K)
+{
+    // Every four 16-bit elements are packed into three bytes (4 * 6 bits == 3 * 8 bits).
+ if (K * 6 % 8 != 0) { throw std::invalid_argument("(K * 6 % 8) should be 0"); }
+ size_t K_fp6_packed = K * 6 / 8;
+ // #pragma omp parallel for
+ for (auto m = 0; m < M; m++) {
+ uint8_t* ptr_6bit = weight_6bit_packed + m * K_fp6_packed;
+ uint16_t* ptr_16bit = weight_16bit + m * K;
+ for (auto k = 0; k < K; k += 4) {
+ cast_fp16_fp6(ptr_16bit, ptr_6bit);
+ ptr_16bit += 4;
+ ptr_6bit += 3;
+ }
+ }
+}
+
+} // namespace
+
+/*
+ * Function to execute the FP6 linear kernel.
+ *
+ * Parameters:
+ * output: output tensor, size M*N
+ * hidden_states: input activation tensor, size N*K
+ * weights_2bit: packed 2bit weights, size M*K*2/8
+ * weights_4bit: packed 4bit weights, size M*K*4/8
+ * scales: scale tensor, size M
+ * workspace: workspace tensor, size M*N*split_k
+ * M: the output channel number of the weight
+ * N: the token number of the activation
+ * K: the input channel number of the weight
+ * split_k: the split size of the GEMM calculation
+ */
+void cuda_wf6af16_linear(torch::Tensor& output,
+ torch::Tensor& hidden_states,
+ torch::Tensor& weights_2bit,
+ torch::Tensor& weights_4bit,
+ torch::Tensor& scales,
+ torch::Tensor& workspace,
+ int M,
+ int N,
+ int K,
+ int split_k)
+{
+ TORCH_CHECK(weights_2bit.device().type() == torch::kCUDA, "weight_2bit must be on CUDA");
+ TORCH_CHECK(weights_4bit.device().type() == torch::kCUDA, "weight_4bit must be on CUDA");
+ TORCH_CHECK(hidden_states.device().type() == torch::kCUDA, "X must be on CUDA");
+ TORCH_CHECK(scales.device().type() == torch::kCUDA, "scales must be on CUDA");
+
+ auto status = fp6_linear_kernel(at::cuda::getCurrentCUDAStream(),
+ (uint4*)(weights_2bit.data_ptr()),
+ (uint4*)(weights_4bit.data_ptr()),
+ (half*)(scales.data_ptr()),
+ (half*)(hidden_states.data_ptr()),
+ (half*)(output.data_ptr()),
+ M,
+ N,
+ K,
+ workspace.data_ptr(),
+ split_k);
+ if (status != cudaSuccess) {
+ AT_ERROR("fp6_linear_kernel failed with error: ", cudaGetErrorString(status));
+ }
+}
+
+/*
+ * Function to prepack the fake 6-bit-quantized FP16 weights into 2bit and 4bit.
+ *
+ * Parameters:
+ * weight: input weight in FP16 (containing the quantized FP6-ranged value), size M*K
+ * Returns:
+ * weight_2bit: output weight in 2bit, size M*K*2/8
+ * weight_4bit: output weight in 4bit, size M*K*4/8
+ */
+std::vector<torch::Tensor> preprocess_weight(torch::Tensor& weight)
+{
+ TORCH_CHECK(weight.dim() == 2, "weight must be 2-dimensional");
+ TORCH_CHECK(weight.scalar_type() == torch::kFloat16, "weight must be FP16");
+ TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous");
+ TORCH_CHECK(weight.device().type() == torch::kCPU, "weight must be on CPU");
+ auto M = weight.size(0);
+ auto K = weight.size(1);
+ TORCH_CHECK(K % 4 == 0, "K must be multiple of 4");
+
+ // Pack weight from FP16 to FP6.
+    uint16_t* weight_16bit_ptr = reinterpret_cast<uint16_t*>(weight.data_ptr());
+    std::vector<uint8_t> weight_6bit_packed(M * K * 6 / 8);
+ uint8_t* weight_6bit_ptr = weight_6bit_packed.data();
+ weight_prepacking_fp16_to_fp6(weight_16bit_ptr, weight_6bit_ptr, M, K);
+
+ // Split weight into 2bit and 4bit.
+    weight_matrix_prepacking(reinterpret_cast<int*>(weight_6bit_ptr), M, K);
+ uint8_t* weight_2bit_ptr = weight_6bit_ptr;
+
+    // Make sure that the new split tensors do not share the underlying memory with the
+    // original one. Otherwise it will incur some problems when the original tensor is
+    // deleted. It also makes the memory flattening risky.
+ auto weight_2bit =
+ torch::from_blob(weight_2bit_ptr, {M * K * 2 / 8}, torch::kUInt8).clone().detach();
+ uint8_t* weight_4bit_ptr = weight_2bit_ptr + M * K * 2 / 8;
+ auto weight_4bit =
+ torch::from_blob(weight_4bit_ptr, {M * K * 4 / 8}, torch::kUInt8).clone().detach();
+
+ return {weight_2bit, weight_4bit};
+}
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h
new file mode 100644
index 000000000000..0f5882d519ca
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.h
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#pragma once
+
+#include <c10/cuda/CUDAStream.h>
+#include <torch/extension.h>
+#include "ds_kernel_utils.h"
+
+#include "fp6_linear.cuh"
+
+void cuda_wf6af16_linear(torch::Tensor& output,
+ torch::Tensor& hidden_states,
+ torch::Tensor& weights_2bit,
+ torch::Tensor& weights_4bit,
+ torch::Tensor& scale,
+ torch::Tensor& workspace,
+ int M,
+ int N,
+ int K,
+ int split_k);
+
+std::vector<torch::Tensor> preprocess_weight(torch::Tensor& Weight);
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu
new file mode 100644
index 000000000000..64e06a5435c6
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu
@@ -0,0 +1,315 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+// clang-format off
+// Put the torch headers at the front to avoid conflict with other headers on
+// `at::nullopt` and `at::optional`.
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+// clang-format on
+
+#include "include/kernel_matmul.cuh"
+#include "include/kernel_reduction.cuh"
+#include "include/weight_prepacking.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+template <typename TilingConfig, typename OutputDataType>
+static void Kernel_Ex(cudaStream_t stream,
+ const uint4* Weight1,
+ const uint4* Weight2,
+ const half* Scales,
+ const half* B,
+ OutputDataType* C,
+ const size_t M_Global,
+ const size_t N_Global,
+ const size_t K_Global,
+ int Split_K)
+{
+#ifdef DEBUG_MODE
+ printf("\n");
+ printf("Launcher.cu->Kernel_Ex():\n");
+ printf("M: %d, N: %d, K: %d, SplitK: %d\n", M_Global, N_Global, K_Global, Split_K);
+ printf("TILE_M: %d, TILE_K: %d, TILE_N: %d\n",
+ TilingConfig::TILE_M,
+ TilingConfig::TILE_K,
+ TilingConfig::TILE_N);
+#endif
+ static size_t SHMEM_SZ =
+ max(TilingConfig::SMEM_SIZE_B_TILE + SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE,
+ TilingConfig::SMEM_SIZE_C_TILE);
+    cudaFuncSetAttribute(QUANT_GEMM_Kernel<TilingConfig, OutputDataType>,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ SHMEM_SZ);
+ size_t dimN = (N_Global - 1) / TilingConfig::TILE_N + 1;
+ size_t dimM = M_Global * Split_K / TilingConfig::TILE_M;
+ dim3 GridDim(dimN, dimM, 1);
+ dim3 BlockDim(WARP_SIZE * TilingConfig::BLOCK_WARPS, 1, 1);
+//
+#ifdef DEBUG_MODE
+ printf(
+ "GridDim.x: %d, GridDim.y: %d, GridDim.z: %d, BlockDim.x: %d, BlockDim.y: %d, BlockDim.z: "
+ "%d SHMEM_SZ: %d\n",
+ GridDim.x,
+ GridDim.y,
+ GridDim.z,
+ BlockDim.x,
+ BlockDim.y,
+ BlockDim.z,
+ SHMEM_SZ);
+ printf("\n");
+#endif
+    QUANT_GEMM_Kernel<TilingConfig, OutputDataType><<<GridDim, BlockDim, SHMEM_SZ, stream>>>(
+ Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+}
+
+/*
+ *
+ */
+cudaError_t fp6_linear_kernel(cudaStream_t stream,
+ const uint4* Weight1,
+ const uint4* Weight2,
+ const half* Scales,
+ const half* B,
+ half* C,
+ const size_t M_Global,
+ const size_t N_Global,
+ const size_t K_Global,
+ float* Reduction_Workspace, // Reduction_Workspace_Size = Split_K *
+ // M_Global * N_Global * sizeof(fp32)
+ int Split_K)
+{
+ assert(M_Global % 256 == 0);
+ assert(K_Global % 64 == 0);
+ assert(N_Global > 0);
+
+ // Work around to support more N shapes:
+ size_t N_PowerOf2;
+ if (N_Global > 0 && N_Global <= 8) N_PowerOf2 = 8;
+ if (N_Global > 8 && N_Global <= 16) N_PowerOf2 = 16;
+ if (N_Global > 16 && N_Global <= 32) N_PowerOf2 = 32;
+ if (N_Global > 32 && N_Global <= 64) N_PowerOf2 = 64;
+ if (N_Global > 64 && N_Global <= 128) N_PowerOf2 = 128;
+ if (N_Global > 128) N_PowerOf2 = ((N_Global - 1) / 128 + 1) * 128;
+
+ if (Split_K == 1) {
+ switch (N_PowerOf2) {
+ case 8:
+                Kernel_Ex<TilingConfig<4, 1, 1>, half>(
+ stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+ break;
+ case 16:
+                Kernel_Ex<TilingConfig<4, 1, 2>, half>(
+ stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+ break;
+ case 32:
+                Kernel_Ex<TilingConfig<4, 1, 4>, half>(
+ stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+ break;
+ case 64:
+                Kernel_Ex<TilingConfig<4, 1, 8>, half>(
+ stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+ break;
+ case 128:
+                Kernel_Ex<TilingConfig<4, 2, 8>, half>(
+ stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+ break;
+ default:
+ if (N_PowerOf2 % 128 != 0) {
+ printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2);
+ return cudaErrorUnknown;
+ }
+                Kernel_Ex<TilingConfig<4, 2, 8>, half>(
+ stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K);
+ break;
+ }
+ } else {
+ switch (N_PowerOf2) {
+ case 8:
+                Kernel_Ex<TilingConfig<4, 1, 1>, float>(stream,
+ Weight1,
+ Weight2,
+ Scales,
+ B,
+ Reduction_Workspace,
+ M_Global,
+ N_Global,
+ K_Global,
+ Split_K);
+ break;
+ case 16:
+                Kernel_Ex<TilingConfig<4, 1, 2>, float>(stream,
+ Weight1,
+ Weight2,
+ Scales,
+ B,
+ Reduction_Workspace,
+ M_Global,
+ N_Global,
+ K_Global,
+ Split_K);
+ break;
+ case 32:
+                Kernel_Ex<TilingConfig<4, 1, 4>, float>(stream,
+ Weight1,
+ Weight2,
+ Scales,
+ B,
+ Reduction_Workspace,
+ M_Global,
+ N_Global,
+ K_Global,
+ Split_K);
+ break;
+ case 64:
+                Kernel_Ex<TilingConfig<4, 1, 8>, float>(stream,
+ Weight1,
+ Weight2,
+ Scales,
+ B,
+ Reduction_Workspace,
+ M_Global,
+ N_Global,
+ K_Global,
+ Split_K);
+ break;
+ case 128:
+                Kernel_Ex<TilingConfig<4, 2, 8>, float>(stream,
+ Weight1,
+ Weight2,
+ Scales,
+ B,
+ Reduction_Workspace,
+ M_Global,
+ N_Global,
+ K_Global,
+ Split_K);
+ break;
+ default:
+ if (N_PowerOf2 % 128 != 0) {
+ printf("QuantLLM_API Error: Unsupported N dimension %d!\n", N_PowerOf2);
+ return cudaErrorUnknown;
+ }
+                Kernel_Ex<TilingConfig<4, 2, 8>, float>(stream,
+ Weight1,
+ Weight2,
+ Scales,
+ B,
+ Reduction_Workspace,
+ M_Global,
+ N_Global,
+ K_Global,
+ Split_K);
+ break;
+ }
+ // Reduction for SplitK
+ dim3 GridDim((M_Global * N_Global) / REDUCTION_ELEMENT_PER_THREADBLOCK, 1, 1);
+ dim3 BlockDim(WARP_SIZE, 1, 1);
+ SplitK_Reduction<<>>(
+ C, Reduction_Workspace, M_Global, N_Global, Split_K);
+ }
+ return cudaGetLastError();
+}
+
+/*
+Computes FP6-FP16 GEMM (PyTorch interface).
+
+[Mathematical Formula]
+Standard definition of linear layer: Out = In * trans(W), where In, Out, and W are stored in
+row-major. After Equivalent transformation : trans(Out) = W * trans(In). Note that we do not
+perform "transpose" during runtime, we instead interpret the In/Out as column-major matrices when
+calling our CUDA kernel.
+
+[Inputs]
+ _in_feats: tensor of shape [B, IC]; // half
+ _weights: int tensor of shape [OC, IC // 16 * 3]; // 3 INT32 words contains 16 FP6 weights.
+ _scales: tensor of shape [OC]; // half
+ splitK: splitting the MatMul problem along K dimension for higher GPU utilization, default 1.
+[Outputs]
+ _out_feats: tensor of shape [B, OC]; // half
+*/
+torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats,
+ torch::Tensor _weights,
+ torch::Tensor _scales,
+ int splitK = 1)
+{
+ int num_in_feats = _in_feats.size(0);
+ int num_in_channels = _in_feats.size(1);
+ int num_out_channels = _weights.size(0);
+ assert(num_in_channels % 64 == 0);
+ assert((num_in_channels / 16 * 3) ==
+ _weights.size(1)); // Making sure the K dimension is matched.
+ //
+ int M = num_out_channels;
+ int K = num_in_channels;
+ int N = num_in_feats;
+ // Input Tensors
+    auto weight1 = reinterpret_cast<const uint4*>(
+ _weights.data_ptr()); // weights is [OC, IC] but in FP6.
+ auto weight2 = weight1 + num_in_channels * num_out_channels * 2 / 128;
+    auto in_feats = reinterpret_cast<const half*>(_in_feats.data_ptr());
+    auto scales = reinterpret_cast<const half*>(_scales.data_ptr());
+ // Output Tensors
+ auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device());
+ at::Tensor _out_feats = torch::empty({num_in_feats, num_out_channels}, options);
+    auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr());
+
+    // Allocate the split-K reduction workspace outside the branch so the buffer stays
+    // alive for the kernel launch below.
+    at::Tensor _workspace;
+    float* Reduction_Workspace = nullptr;
+    if (splitK != 1) {
+        auto options = torch::TensorOptions().dtype(torch::kFloat32).device(_in_feats.device());
+        _workspace = torch::empty({splitK, num_in_feats, num_out_channels}, options);
+        Reduction_Workspace = reinterpret_cast<float*>(
+            _workspace.data_ptr());  // Reduction_Workspace_Size = Split_K * M_Global *
+                                     // N_Global * sizeof(fp32)
+    }
+
+ fp6_linear_kernel(0, // Using default stream here.
+ weight1,
+ weight2,
+ scales,
+ in_feats,
+ out_feats,
+ M,
+ N,
+ K,
+ Reduction_Workspace,
+ splitK);
+
+ return _out_feats;
+}
+
+/*
+ * Inputs:
+ * (1) unsigned char Weight_6bit [M*K*6/8]
+ * Outputs:
+ * (1) unsigned char Weight_2bit [M*K*2/8]
+ * (2) unsigned char Weight_4bit [M*K*4/8]
+ *
+ * Assumption: Weight_6bit, Weight_2bit, Weight_4bit all stored continuously in row-major.
+ * 8 FP6 = 6 Bytes
+ * 8 FP4 = 4 Bytes
+ * 8 FP2 = 2 Bytes
+ */
+
+/*
+ * Weight prepacking (Pytorch interface).
+ * [Input & Output]
+ * fp6_tensor: int tensor of shape [OC, IC // 16 * 3]; // 3 INT32 words contains 16 FP6 weights.
+ * [Output]
+ * packed_tensor: int tensor of shape [OC, IC // 16 * 3];
+ */
+torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t OC, size_t IC)
+{
+ assert((OC % 256 == 0) && (IC % 64 == 0));
+ assert((fp6_tensor.size(0) == OC) && (fp6_tensor.size(1) == IC / 16 * 3));
+ // auto packed_tensor = torch::empty_like(fp6_tensor);
+ // auto packed_tensor_ptr = reinterpret_cast(packed_tensor.data_ptr());
+    auto fp6_tensor_ptr = reinterpret_cast<int*>(fp6_tensor.data_ptr());
+ weight_matrix_prepacking(fp6_tensor_ptr, OC, IC);
+ return fp6_tensor;
+}
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh
new file mode 100644
index 000000000000..95f7f6050c15
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cuh
@@ -0,0 +1,46 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <torch/extension.h>
+
+/*
+ * Computes FP6-FP16 GEMM (C++ interface).
+ */
+cudaError_t fp6_linear_kernel(cudaStream_t stream,
+ const uint4* Weight1,
+ const uint4* Weight2,
+ const half* Scales,
+ const half* B,
+ half* C,
+ const size_t M_Global,
+ const size_t N_Global,
+ const size_t K_Global,
+ float* Reduction_Workspace, // Reduction_Workspace_Size = Split_K *
+ // M_Global * N_Global * sizeof(fp32)
+ int Split_K);
+
+/*
+ * Computes FP6-FP16 GEMM (PyTorch interface).
+ */
+torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats,
+ torch::Tensor _weights,
+ torch::Tensor _scales,
+ int splitK = 1);
+
+/*
+ * In-place weight prepacking (C++ interface).
+ */
+void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K);
+
+/*
+ * Weight prepacking (PyTorch interface).
+ */
+torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t M, size_t K);
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h
new file mode 100644
index 000000000000..76e8eda2d35e
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h
@@ -0,0 +1,96 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#ifndef CONFIGS_H
+#define CONFIGS_H
+
+// #define DEBUG_MODE
+#define PIPELINE_LEVEL_GMEM 2
+#define PIPELINE_LEVEL_SMEM 2 // only support 2
+
+/************************ Hardware Parameters ************************/
+#define WARP_SIZE 32
+#define REG_BIT_WIDTH 32
+// mma: M=16 K=16 N=8
+#define MMA_8 8
+#define MMA_16 16
+// for memory access
+#define THREAD_OPT_ACCESS_BIT_WIDTH_128 128 // LDS.128, cp_async.128, ...
+#define BIT_WIDTH_PER_HALF 16 // Half precision: FP16
+
+/******************** Register Allocation For GEMM ********************/
+#define REG_PER_THREAD_C_TENSOR_16_16 8 // 8 for FP32 Accumulation
+/********************** Memory Padding Parameters **********************/
+// Eliminating bank-conflict
+#define PADDING_BYTES_16 16 // Padding 16 bytes each column
+#define PADDING_SHARED_MEM_FOR_B_8 \
+ 8 // Padding 8 half each column, during CopyFromGlobalToShared() for B
+#define PADDING_SHARED_MEM_FOR_C_4 \
+ 4 // Padding 4 float each column, during StoreToSharedMemoryFromRegister() for C
+/************************* WARP Tiling part-1 *************************/
+#define WARP_ROW_MMA_TENSORS 4
+#define WARP_M (WARP_ROW_MMA_TENSORS * MMA_16) // 64
+#define WARP_K_MMA_TENSORS 4
+#define WARP_K (WARP_K_MMA_TENSORS * MMA_16) // 64
+template <int BLOCK_ROW_WARPS_, int BLOCK_COL_WARPS_, int WARP_COL_MMA_TENSORS_>
+struct TilingConfig {
+ // Depending on "n" dimension of the GEMM
+ static constexpr int BLOCK_ROW_WARPS = BLOCK_ROW_WARPS_;
+ static constexpr int BLOCK_COL_WARPS = BLOCK_COL_WARPS_;
+ static constexpr int WARP_COL_MMA_TENSORS = WARP_COL_MMA_TENSORS_;
+ /************************* WARP Tiling part-2 *************************/
+ static constexpr int WARP_N = WARP_COL_MMA_TENSORS * MMA_8;
+ /*************************Thread Block Tiling *************************/
+ static constexpr int TILE_M = WARP_M * BLOCK_ROW_WARPS;
+ static constexpr int TILE_N = MMA_8 * WARP_COL_MMA_TENSORS * BLOCK_COL_WARPS;
+ static constexpr int TILE_K = WARP_K;
+ /********************** #Thread per Thread Block **********************/
+ static constexpr int BLOCK_WARPS = BLOCK_ROW_WARPS * BLOCK_COL_WARPS;
+ static constexpr int BLOCK_THREADS = BLOCK_WARPS * WARP_SIZE;
+ /******************************* Others *******************************/
+ static constexpr int SMEM_SIZE_B_TILE = TILE_N * (TILE_K + PADDING_BYTES_16) * 2 *
+ PIPELINE_LEVEL_GMEM; // sizeof(half)=2, doubleBuffer=2
+ static constexpr int SMEM_SIZE_C_TILE =
+ TILE_N * (TILE_M + PADDING_BYTES_16) * 4; // sizeof(float)=4
+};
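+// Example: TilingConfig<4, 1, 8> yields TILE_M = 4 * 64 = 256, TILE_N = 8 * 8 * 1 = 64,
+// TILE_K = 64, BLOCK_WARPS = 4 and BLOCK_THREADS = 128.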
+
+/************************ General Config for Quant-LLM **********************/
+#define WEIGHT_FRAG1_BIT_WIDTH 2
+#define WEIGHT_FRAG2_BIT_WIDTH 4
+#define WEIGHT_BIT_WIDTH (WEIGHT_FRAG1_BIT_WIDTH + WEIGHT_FRAG2_BIT_WIDTH) // 6
+// #define QUANT_GROUP_SIZE_DIVIDED_BY_64 4 //
+// QuantGroupSize: 4*64 = 256
+/*************************** 64*64 Weights of A WARP *************************/
+#define WEIGHT_PER_UNIT (WARP_M * WARP_K) // 64*64
+#define SMEM_SIZE_IN_BYTES_PER_WARP_A1 \
+ (WEIGHT_PER_UNIT * WEIGHT_FRAG1_BIT_WIDTH / \
+ 8) // 1024 Bytes #doubleBuffer not taken into consideration
+#define SMEM_SIZE_IN_BYTES_PER_WARP_A2 \
+ (WEIGHT_PER_UNIT * WEIGHT_FRAG2_BIT_WIDTH / \
+ 8) // 2048 Bytes #doubleBuffer not taken into consideration
+#define SMEM_SIZE_A1_TILE \
+ (SMEM_SIZE_IN_BYTES_PER_WARP_A1 * 4 * \
+ PIPELINE_LEVEL_GMEM) // #WARP=4, triple buffer for 3-level pipeline for A = 12 KB; double
+ // buffer for 2-level pipeline for A = 8 KB.
+#define SMEM_SIZE_A2_TILE \
+ (SMEM_SIZE_IN_BYTES_PER_WARP_A2 * 4 * \
+ PIPELINE_LEVEL_GMEM) // #WARP=4, triple buffer for 3-level pipeline for A = 24 KB; double
+ // buffer for 2-level pipeline for A = 16 KB.
+/******************** Global Memory Layout For QUANTIZED DATA ******************/
+#define NUM_INT4_PER_UNIT_2BIT_FRAG (WEIGHT_PER_UNIT * WEIGHT_FRAG1_BIT_WIDTH / 128) // 64
+#define NUM_INT4_PER_UNIT_4BIT_FRAG (WEIGHT_PER_UNIT * WEIGHT_FRAG2_BIT_WIDTH / 128) // 128
+/******************** Register Allocation For QUANTIZED DATA ******************/
+#define WEIGHT_PER_THREAD (WEIGHT_PER_UNIT / WARP_SIZE) // 128
+#define REG_PER_THREAD_2BIT_FRAG (WEIGHT_PER_THREAD / REG_BIT_WIDTH * 2) // 8
+#define REG_PER_THREAD_4BIT_FRAG (WEIGHT_PER_THREAD / REG_BIT_WIDTH * 4) // 16
+/******************** Register Allocation For QUANT Scales ******************/
+#define WARP_REG_QUANT_SCALE 4 // 8 rows per thread -> 8 FP16 scales -> 4 registers
+#define WARP_REG_QUANT_SCALE_DISTRIBUTED \
+ 1 // T0-T3, T4-T7, ..., T28-T31 share the same scales, using shfl to get all the scales for
+ // each thread
+
+#endif // CONFIGS_H
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh
new file mode 100644
index 000000000000..aa6ea6c4b1c2
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh
@@ -0,0 +1,261 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#include "configs.h"
+#include "utils_core.cuh"
+#include "utils_gmem.cuh"
+
+/*
+ * C = A*B
+ * A: row major with ahead-of-time layout transformation, FP6
+ * B: col major, FP16
+ * C: col major, FP16
+ */
+template <typename TilingConfig, typename OutputDataType>
+__global__ void QUANT_GEMM_Kernel(const uint4* Weight1,
+ const uint4* Weight2,
+ const half* Scales,
+ const half* B,
+ OutputDataType* C,
+ const size_t M_Global,
+ const size_t N_Global,
+ const size_t K_Global,
+ int Split_K)
+{
+#ifdef DEBUG_MODE
+ assert(K_Global % TilingConfig::TILE_K == 0);
+ assert(M_Global % TilingConfig::TILE_M == 0);
+ assert(gridDim.y == Split_K * (M_Global / TilingConfig::TILE_M));
+#endif
+ extern __shared__ __align__(128)
+ half smem[]; // Dynamic shared memory for FP16 A tiles, 128 Bytes aligned
+ half(*smem_array)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+ reinterpret_cast<half(*)[WARP_K + PADDING_SHARED_MEM_FOR_B_8]>(
+ smem +
+ (SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE) / 2); // Dynamic shared memory for FP16 B tiles
+ __shared__ half QuantScales[64 * TilingConfig::BLOCK_WARPS]; // static shared memory for
+ // quantization scales, 64 row per
+ // warp * 4 warps = 512 Bytes
+ // Thread Block Mapping, considering SplitK
+ const size_t BatchID = blockIdx.y / (M_Global / TilingConfig::TILE_M);
+ const size_t x = blockIdx.x; // Output Block ID: (BlockID_Row = y; BlockID_Col = x )
+ const size_t y =
+ blockIdx.y %
+ (M_Global / TilingConfig::TILE_M); // Output Block ID: (BlockID_Row = y; BlockID_Col = x )
+ const size_t Tile_Start_M = y * TilingConfig::TILE_M;
+ const size_t Tile_Start_N = x * TilingConfig::TILE_N;
+ const size_t NumColumnToCopy = (N_Global - Tile_Start_N) < TilingConfig::TILE_N
+ ? (N_Global - Tile_Start_N)
+ : TilingConfig::TILE_N;
+ const size_t NumBlock_K = K_Global / TilingConfig::TILE_K;
+ const size_t AverageNumBlock_K = NumBlock_K / Split_K;
+ const size_t ExtraNumBlock_K = NumBlock_K - AverageNumBlock_K * Split_K;
+ size_t NumIter = AverageNumBlock_K;
+ if (BatchID < ExtraNumBlock_K) NumIter++;
+ size_t StartBlockID_K = AverageNumBlock_K * BatchID;
+ if (BatchID < ExtraNumBlock_K)
+ StartBlockID_K += BatchID;
+ else
+ StartBlockID_K += ExtraNumBlock_K;
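+ // e.g., with NumBlock_K = 10 and Split_K = 4: batches 0 and 1 each get 3 K-blocks (starting at
+ // block 0 and block 3), while batches 2 and 3 each get 2 (starting at block 6 and block 8).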
+ // Warp ID.
+ const int warpId = threadIdx.x / WARP_SIZE;
+ int WARP_i =
+ warpId / TilingConfig::BLOCK_COL_WARPS; // WARP_i: row number; WARP_j: column number
+ // int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS;
+ // Global Memory Address for Matrix A (Weight)
+ // ///////////////////////////////////////////////////////////////////////// StartPTR for each
+ // ThreadBlock(TB)
+ const uint4* TB_StartGPTR_A1 =
+ Weight1 + (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_UNIT_2BIT_FRAG;
+ const uint4* TB_StartGPTR_A2 =
+ Weight2 + (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_UNIT_4BIT_FRAG;
+ // StartPTR for each WARP.
+ const uint4* WARP_StartGPTR_A1 =
+ TB_StartGPTR_A1 + WARP_i * NumBlock_K * NUM_INT4_PER_UNIT_2BIT_FRAG;
+ const uint4* WARP_StartGPTR_A2 =
+ TB_StartGPTR_A2 + WARP_i * NumBlock_K * NUM_INT4_PER_UNIT_4BIT_FRAG;
+ // StartPTR for each WARP, considering SplitK
+ const size_t WARP_Start_UnitID_K = StartBlockID_K;
+ WARP_StartGPTR_A1 += WARP_Start_UnitID_K * NUM_INT4_PER_UNIT_2BIT_FRAG;
+ WARP_StartGPTR_A2 += WARP_Start_UnitID_K * NUM_INT4_PER_UNIT_4BIT_FRAG;
+ // Copying A tile from Global to Shared, using double-buffer
+ // ////////////////////////////////////////////////////////// StartSPTR for each ThreadBlock
+ uint32_t* AFrag_2BIT_SPTR = reinterpret_cast<uint32_t*>(smem);
+ uint32_t* AFrag_4BIT_SPTR =
+ AFrag_2BIT_SPTR +
+ SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * TilingConfig::BLOCK_WARPS *
+ PIPELINE_LEVEL_GMEM; // 8 buffers including double buffers, 12 for triple buffers
+ // StartSPTR for each WARP
+ AFrag_2BIT_SPTR += warpId * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4;
+ AFrag_4BIT_SPTR += warpId * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4;
+ // Pre-fetch of A tile
+ for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) {
+ CopyFromGlobalToShared_A<SMEM_SIZE_IN_BYTES_PER_WARP_A1>(
+ AFrag_2BIT_SPTR + i * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * 4, WARP_StartGPTR_A1);
+ CopyFromGlobalToShared_A<SMEM_SIZE_IN_BYTES_PER_WARP_A2>(
+ AFrag_4BIT_SPTR + i * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * 4, WARP_StartGPTR_A2);
+ WARP_StartGPTR_A1 += SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16;
+ WARP_StartGPTR_A2 += SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16;
+ }
+ // Global Memory Address for Matrix A (QuantScale)
+ // /////////////////////////////////////////////////////////////////////
+ const half* TB_StartGPTR_A_Scale = Scales + (y * TilingConfig::BLOCK_ROW_WARPS) * 64;
+ const half* WARP_StartGPTR_A_Scales = TB_StartGPTR_A_Scale + WARP_i * 64;
+ CopyFromGlobalToShared_Scales(QuantScales + WARP_i * 64, WARP_StartGPTR_A_Scales);
+ // Copying B tile from Global to Shared, considering SplitK
+ // /////////////////////////////////////////////////////////////
+ const half* BTile_GPTR = B + Tile_Start_N * K_Global + StartBlockID_K * TilingConfig::TILE_K;
+ for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) {
+ CopyFromGlobalToShared<TilingConfig::TILE_N, TilingConfig::BLOCK_WARPS>(
+ smem_array + i * TilingConfig::TILE_N, BTile_GPTR, K_Global, NumColumnToCopy);
+ BTile_GPTR += TilingConfig::TILE_K;
+ }
+ // Register Allocation for A, B, and C, initialized to zeros
+ // /////////////////////////////////////////////////////////////////////
+ constexpr int NumRegSets_a =
+ WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block
+ constexpr int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1)
+ ? 1
+ : TilingConfig::WARP_COL_MMA_TENSORS /
+ 2; // 1 set = 4 registers, containing a 16*16 MMA block
+#ifdef PIPELINE_LEVEL_SMEM
+ uint32_t a[NumRegSets_a * PIPELINE_LEVEL_SMEM]
+ [4]; // double/Triple buffer is used // Registers to store decompressed FP6
+ uint32_t b[NumRegSets_b * PIPELINE_LEVEL_SMEM]
+ [4]; // double/Triple buffer is used // Register to store FP16 B matrix (a slice)
+#endif
+ float c[NumRegSets_a * NumRegSets_b][REG_PER_THREAD_C_TENSOR_16_16];
+ for (int i = 0; i < NumRegSets_a * NumRegSets_b; i++)
+ for (int j = 0; j < REG_PER_THREAD_C_TENSOR_16_16; j++) c[i][j] = 0.0f;
+ //
+ cp_async_wait_all();
+ __syncthreads();
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ uint32_t Scales_RPTR[4]; // 4 Registers per thread for Quantization Scales
+ ExtractFromSharedToReg_Scales(Scales_RPTR, QuantScales + WARP_i * 64);
+#ifdef PIPELINE_LEVEL_SMEM
+ // Initializing the Software Pipeline: writing registers.
+ // ////////////////////////////////////////////////////////////////////////////////////////////////
+ initialize_mma_slice<TilingConfig>(
+ a, b, AFrag_2BIT_SPTR, AFrag_4BIT_SPTR, smem_array, Scales_RPTR);
+#endif
+// The outer loop.
+// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma unroll(1)
+ for (size_t tile_id_k = 0; tile_id_k < NumIter; tile_id_k++) {
+ // Triple buffer for A Tile
+ uint32_t* __restrict__ read_SPTR_Frag1 =
+ AFrag_2BIT_SPTR + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) *
+ SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 *
+ 4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+ uint32_t* __restrict__ read_SPTR_Frag2 =
+ AFrag_4BIT_SPTR + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) *
+ SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 *
+ 4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+#ifdef PIPELINE_LEVEL_SMEM
+ uint32_t* __restrict__ read2_SPTR_Frag1 =
+ AFrag_2BIT_SPTR +
+ ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * 4;
+ uint32_t* __restrict__ read2_SPTR_Frag2 =
+ AFrag_4BIT_SPTR +
+ ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * 4;
+#endif
+ uint32_t* __restrict__ write_SPTR_Frag1 =
+ AFrag_2BIT_SPTR + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) *
+ SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 *
+ 4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+ uint32_t* __restrict__ write_SPTR_Frag2 =
+ AFrag_4BIT_SPTR + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) *
+ SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 *
+ 4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16
+ // Triple buffer for B Tile
+ half __restrict__(*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+ smem_array + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+#ifdef PIPELINE_LEVEL_SMEM
+ half __restrict__(*read2_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+ smem_array + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+#endif
+ half __restrict__(*write_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] =
+ smem_array +
+ ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N;
+ //
+ bool GlobalCopy = (tile_id_k + PIPELINE_LEVEL_GMEM - 1) < NumIter;
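+ // During the last (PIPELINE_LEVEL_GMEM - 1) iterations GlobalCopy is false, so the copies below
+ // only drain the software pipeline without issuing new global-memory loads.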
+ // Copying A tile from global memory to shared memory, bypassing L1, using double buffering
+ CopyFromGlobalToShared_A<SMEM_SIZE_IN_BYTES_PER_WARP_A1>(
+ write_SPTR_Frag1, WARP_StartGPTR_A1, GlobalCopy);
+ CopyFromGlobalToShared_A<SMEM_SIZE_IN_BYTES_PER_WARP_A2>(
+ write_SPTR_Frag2, WARP_StartGPTR_A2, GlobalCopy);
+ // copying B tile from GlobalMemory to SharedMemory
+ CopyFromGlobalToShared<TilingConfig::TILE_N, TilingConfig::BLOCK_WARPS>(
+ write_SPTR, BTile_GPTR, K_Global, NumColumnToCopy, GlobalCopy);
+ cp_async_group_commit();
+#ifdef PIPELINE_LEVEL_SMEM
+ core_mma_slice<TilingConfig>(c,
+ a,
+ b,
+ read_SPTR_Frag1,
+ read_SPTR_Frag2,
+ read_SPTR,
+ Scales_RPTR,
+ 1); // read_SPTR_Frag1, read_SPTR_Frag2 are different for each
+ // WARP; read_SPTR is shared among WARPs
+ core_mma_slice<TilingConfig>(
+ c, a, b, read_SPTR_Frag1, read_SPTR_Frag2, read_SPTR, Scales_RPTR, 2);
+ core_mma_slice<TilingConfig>(
+ c, a, b, read_SPTR_Frag1, read_SPTR_Frag2, read_SPTR, Scales_RPTR, 3);
+ // Barriers and Synchronizations
+ cp_async_wait_group();
+ __syncthreads();
+ core_mma_slice<TilingConfig>(
+ c, a, b, read2_SPTR_Frag1, read2_SPTR_Frag2, read2_SPTR, Scales_RPTR, 0);
+ // Updating global PTRs
+ WARP_StartGPTR_A1 +=
+ SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; // 4KB/16=256 (1)/16: int4*+1 = char*+16
+ WARP_StartGPTR_A2 +=
+ SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; // 8KB/16=512 (1)/16: int4*+1 = char*+16
+ BTile_GPTR += TilingConfig::TILE_K;
+#else
+ PipelinedCoreLoop<TilingConfig>(
+ c,
+ read_SPTR,
+ read_SPTR_Frag1,
+ read_SPTR_Frag2,
+ Scales_RPTR); // read_SPTR_Frag1, read_SPTR_Frag2 are different for each WARP;
+ // read_SPTR is shared among WARPs
+ // Updating global PTRs
+ WARP_StartGPTR_A1 +=
+ SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; // 4KB/16=256 (1)/16: int4*+1 = char*+16
+ WARP_StartGPTR_A2 +=
+ SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; // 8KB/16=512 (1)/16: int4*+1 = char*+16
+ BTile_GPTR += TilingConfig::TILE_K;
+ // Barriers and Synchronizations
+ cp_async_wait_group();
+ __syncthreads();
+#endif
+ }
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Store the C fragments to shared memory.
+ float(*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4] =
+ reinterpret_cast<float(*)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4]>(smem);
+ StoreToSharedMemoryFromRegister<TilingConfig>(smem_CFrag, c);
+ __syncthreads();
+ // Now that shared memory contains all the D tiles, stream them to global memory.
+ OutputDataType* BlockGlobalPTR =
+ C + BatchID * (M_Global * N_Global) + Tile_Start_M + Tile_Start_N * M_Global;
+ for (size_t i = warpId; i < NumColumnToCopy; i += TilingConfig::BLOCK_WARPS) // i-th column
+#pragma unroll
+ for (size_t j = threadIdx.x % WARP_SIZE; j < TilingConfig::TILE_M;
+ j += WARP_SIZE) // j-th row
+ {
+ if constexpr (std::is_same<OutputDataType, half>::value)
+ BlockGlobalPTR[j + i * M_Global] = __float2half_rn(smem_CFrag[i][j]);
+ else
+ BlockGlobalPTR[j + i * M_Global] = smem_CFrag[i][j];
+ }
+}
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh
new file mode 100644
index 000000000000..8c49f8b0b3a5
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#define REDUCTION_ELEMENT_PER_THREADBLOCK 256
+#define HALF_PER_128BIT 8
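+// Each thread block reduces 256 half-precision outputs, 8 per thread via 128-bit accesses,
+// i.e. 256 / 8 = 32 threads' worth of work per block.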
+
+__global__ void SplitK_Reduction(half* C,
+ float* Reduction_Workspace,
+ size_t M_Global,
+ size_t N_Global,
+ int Split_K)
+{
+ half* WARP_GPTR_C = C + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x;
+ float* WARP_GPTR_R = Reduction_Workspace + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x;
+ half* THREAD_GPTR_C = WARP_GPTR_C + threadIdx.x * HALF_PER_128BIT;
+ float* THREAD_GPTR_R = WARP_GPTR_R + threadIdx.x * HALF_PER_128BIT;
+ // Initializing Thread-Local Results
+ float Results[HALF_PER_128BIT];
+#pragma unroll
+ for (int i = 0; i < HALF_PER_128BIT; i++) Results[i] = 0.0f;
+ // Reduction
+ for (int i = 0; i < Split_K; i++) {
+#pragma unroll
+ for (int j = 0; j < HALF_PER_128BIT; j++) Results[j] += THREAD_GPTR_R[j];
+ THREAD_GPTR_R += M_Global * N_Global;
+ }
+// Writing to global memory
+#pragma unroll
+ for (int i = 0; i < HALF_PER_128BIT; i++) THREAD_GPTR_C[i] = __float2half_rn(Results[i]);
+}
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh
new file mode 100644
index 000000000000..7f36cfd5d961
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh
@@ -0,0 +1,52 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#ifndef PTX_CP_ASYNC_CUH
+#define PTX_CP_ASYNC_CUH
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+template <int SizeInBytes>
+__device__ __forceinline__ void cp_async(half* smem_ptr,
+ const half* global_ptr,
+ bool pred_guard = true)
+{
+ static_assert(SizeInBytes == 16, "Size is not supported");
+ unsigned smem_int_ptr = __cvta_generic_to_shared(smem_ptr);
+ asm volatile(
+ "{ \n"
+ " .reg .pred p;\n"
+ " setp.ne.b32 p, %0, 0;\n"
+ " @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+ "}\n" ::"r"((int)pred_guard),
+ "r"(smem_int_ptr),
+ "l"(global_ptr),
+ "n"(SizeInBytes));
+}
+
+/// Establishes an ordering w.r.t. previously issued cp.async instructions. Does not block.
+__device__ __forceinline__ void cp_async_group_commit()
+{
+ asm volatile("cp.async.commit_group;\n" ::);
+}
+
+/// Blocks until all but the most recent N cp.async.commit_group operations have committed.
+template <int N>
+__device__ __forceinline__ void cp_async_wait_group()
+{
+ asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+}
+
+/// Blocks until all previous cp.async.commit_group operations have committed.
+// cp.async.wait_all is equivalent to :
+// cp.async.commit_group;
+// cp.async.wait_group 0;
+__device__ __forceinline__ void cp_async_wait_all() { asm volatile("cp.async.wait_all;\n" ::); }
+
+#endif
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh
new file mode 100644
index 000000000000..f13abe036279
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh
@@ -0,0 +1,125 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#ifndef PTX_MMA_CUH
+#define PTX_MMA_CUH
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <assert.h>
+#include "configs.h"
+
+#ifdef PIPELINE_LEVEL_SMEM
+template <typename TilingConfig>
+__device__ __forceinline__ void B_FromSharedToReg(
+ uint32_t __restrict__ Reg[][4],
+ half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+ int slice_id)
+{
+#ifdef DEBUG_MODE
+ static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) ||
+ (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0));
+#endif
+
+ const int warpId = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS;
+ int warp_start_col = TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 *
+ WARP_j; // each warp may start from reading warp_start_col'th column of
+ // the B tile in shared memory
+#ifdef DEBUG_MODE
+ assert(warp_start_col == 0);
+#endif
+
+ int col = (lane_id % 8) + (lane_id / 16) * 8;
+ int row = (lane_id % 16) / 8 * 8;
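+ // e.g., lane 0 -> (col 0, row 0), lane 8 -> (col 0, row 8), lane 16 -> (col 8, row 0); each lane
+ // supplies the shared-memory address of one row of an 8x8 tile for the ldmatrix loads below.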
+ uint32_t smem_local_ptr = static_cast<uint32_t>(
+ __cvta_generic_to_shared(&read_SPTR[warp_start_col + col][slice_id * MMA_16 + row]));
+ if (TilingConfig::WARP_COL_MMA_TENSORS == 1) {
+ asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
+ : "=r"(Reg[0][0]), "=r"(Reg[0][1])
+ : "r"(smem_local_ptr));
+ } else {
+#pragma unroll
+ for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) {
+ asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+ : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3])
+ : "r"(smem_local_ptr));
+ smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half);
+ }
+ }
+}
+#else
+// Debug: Whether ldmatrix.trans is required???
+// B is in column-major
+template <typename TilingConfig>
+__device__ __forceinline__ void B_FromSharedToReg(
+ uint32_t __restrict__ Reg[][4],
+ half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+ int k_offset)
+{
+#ifdef DEBUG_MODE
+ static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) ||
+ (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0));
+#endif
+
+ const int warpId = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS;
+ int warp_start_col = TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 *
+ WARP_j; // each warp may start from reading warp_start_col'th column of
+ // the B tile in shared memory
+#ifdef DEBUG_MODE
+ assert(warp_start_col == 0);
+#endif
+
+ int col = (lane_id % 8) + (lane_id / 16) * 8;
+ int row = (lane_id % 16) / 8 * 8;
+ uint32_t smem_local_ptr = static_cast<uint32_t>(
+ __cvta_generic_to_shared(&read_SPTR[warp_start_col + col][k_offset + row]));
+ if (TilingConfig::WARP_COL_MMA_TENSORS == 1) {
+ asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
+ : "=r"(Reg[0][0]), "=r"(Reg[0][1])
+ : "r"(smem_local_ptr));
+ } else {
+#pragma unroll
+ for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) {
+ asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+ : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3])
+ : "r"(smem_local_ptr));
+ smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half);
+ }
+ }
+}
+#endif
+
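+/*
+ * One m16n8k16 tensor core MMA with FP16 inputs and FP32 accumulation: the c[] registers are
+ * passed both as the accumulator input and as the output.
+ */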
+__device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t __restrict__ c[],
+ uint32_t __restrict__* a,
+ uint32_t __restrict__* b)
+{
+ asm volatile(
+ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+ "{ %0, %1, %2, %3},"
+ "{ %4, %5, %6, %7 },"
+ "{ %8, %9 },"
+ "{ %10, %11, %12, %13 };"
+ : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+ : "r"(a[0]),
+ "r"(a[1]),
+ "r"(a[2]),
+ "r"(a[3]),
+ "r"(b[0]),
+ "r"(b[1]),
+ "r"(c[0]),
+ "r"(c[1]),
+ "r"(c[2]),
+ "r"(c[3]));
+}
+
+#endif
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh
new file mode 100644
index 000000000000..713cebc57e33
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh
@@ -0,0 +1,246 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#ifndef UTILS_CORE_CUH
+#define UTILS_CORE_CUH
+
+#include <assert.h>
+
+#include "configs.h"
+#include "ptx_mma.cuh"
+#include "utils_paralleldequant.cuh"
+
+#ifdef PIPELINE_LEVEL_SMEM
+template <int NUM_INT_PER_THREAD>
+__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[],
+ uint32_t* SPTR,
+ int slice_id)
+{
+ SPTR += slice_id * (NUM_INT_PER_THREAD * WARP_SIZE);
+ int lane_id = threadIdx.x % WARP_SIZE;
+#pragma unroll
+ for (int i = 0; i < NUM_INT_PER_THREAD; i++) { Reg[i] = SPTR[lane_id + i * WARP_SIZE]; }
+}
+
+template <typename TilingConfig>
+__device__ __forceinline__ void initialize_mma_slice(
+ uint32_t (*a)[4],
+ uint32_t (*b)[4],
+ uint32_t* __restrict__ A1_SPTR_read,
+ uint32_t* __restrict__ A2_SPTR_read,
+ half __restrict__ (*B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+ uint32_t* RPTR_Scales)
+{
+ // Writing registers
+ // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6
+ // register per thread;
+ uint32_t a_1[2]; // NO double buffer
+ uint32_t a_2[4]; // NO double buffer
+ CopyFromSharedToRegister_AFrag<2>(a_1, A1_SPTR_read, 0);
+ CopyFromSharedToRegister_AFrag<4>(a_2, A2_SPTR_read, 0);
+ Dequant_32FP6_4Way(a, a_1, a_2, RPTR_Scales); // SIMT Dequant: dequantizing FP6 to FP16 at
+ // register level, dequantizing a slice each time
+ B_FromSharedToReg<TilingConfig>(b, B_SPTR_read, 0); // Loading B from shared to registers
+}
+
+template <typename TilingConfig>
+__device__ __forceinline__ void core_mma_slice(
+ float c[][REG_PER_THREAD_C_TENSOR_16_16],
+ uint32_t (*a)[4],
+ uint32_t (*b)[4],
+ uint32_t* __restrict__ A1_SPTR_read,
+ uint32_t* __restrict__ A2_SPTR_read,
+ half __restrict__ (*B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+ uint32_t* RPTR_Scales,
+ int slice_id) // writing slice[slice_id] to registers, k=0 -> slice_id=1 for prefetching
+{
+#ifdef DEBUG_MODE
+ assert(
+ (TilingConfig::WARP_COL_MMA_TENSORS == 1) ||
+ (TilingConfig::WARP_COL_MMA_TENSORS % 2 ==
+ 0)); // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded to a 16*16 MMA block
+#endif
+ const int NumRegSets_a =
+ WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block
+ const int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1)
+ ? 1
+ : TilingConfig::WARP_COL_MMA_TENSORS /
+ 2; // 1 set = 4 registers, containing a 16*16 MMA block
+ uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] =
+ reinterpret_cast<uint32_t(*)[REG_PER_THREAD_C_TENSOR_16_16]>(
+ c); // Registers for accumulated FP32 results
+
+ // Setting RPTRs for double buffers
+ uint32_t(*a_read)[4] = a;
+ uint32_t(*a_write)[4] = a;
+ uint32_t(*b_read)[4] = b;
+ uint32_t(*b_write)[4] = b;
+ if (slice_id % 2 == 1) {
+ b_write += NumRegSets_b;
+ a_write += NumRegSets_a;
+ } else {
+ b_read += NumRegSets_b;
+ a_read += NumRegSets_a;
+ }
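+ // Even and odd slice_ids alternate between the two halves of the a/b register buffers: the MMAs
+ // below consume the half written for the previous slice, while the loads at the end of this
+ // function refill the other half for the next slice.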
+
+// Reading registers and issuing core tensor core computations (a slice of A and B tile in shared
+// memory)
+#pragma unroll
+ for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) {
+ if (TilingConfig::WARP_COL_MMA_TENSORS == 1) {
+ MMA_FP16_M16N8K16(c_uint_ptr[i], a_read[i], b_read[0]);
+ } else {
+#pragma unroll
+ for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) {
+ MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a_read[i], b_read[j]);
+ MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4,
+ a_read[i],
+ b_read[j] + 2); // c+4; b+2
+ }
+ }
+ }
+
+ // Writing registers
+ // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6
+ // register per thread;
+ uint32_t a_1[2]; // NO double buffer
+ uint32_t a_2[4]; // NO double buffer
+ CopyFromSharedToRegister_AFrag<2>(a_1, A1_SPTR_read, slice_id);
+ CopyFromSharedToRegister_AFrag<4>(a_2, A2_SPTR_read, slice_id);
+ Dequant_32FP6_4Way(
+ a_write, a_1, a_2, RPTR_Scales); // SIMT Dequant: dequantizing FP6 to FP16 at register
+ // level, dequantizing a slice each time
+ B_FromSharedToReg<TilingConfig>(
+ b_write, B_SPTR_read, slice_id); // Loading B from shared to registers
+}
+
+#else
+// Old version with naive pipeline design
+template <int NUM_INT_PER_THREAD>
+__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[], uint32_t* SPTR)
+{
+ int lane_id = threadIdx.x % WARP_SIZE;
+#pragma unroll
+ for (int i = 0; i < NUM_INT_PER_THREAD; i++) { Reg[i] = SPTR[lane_id + i * WARP_SIZE]; }
+}
+template <typename TilingConfig>
+__device__ __forceinline__ void PipelinedCoreLoop(
+ float c[][REG_PER_THREAD_C_TENSOR_16_16],
+ half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+ uint32_t* __restrict__ read_SPTR_Frag1,
+ uint32_t* __restrict__ read_SPTR_Frag2,
+ uint32_t* RPTR_Scales)
+{
+#ifdef DEBUG_MODE
+ assert(
+ (TilingConfig::WARP_COL_MMA_TENSORS == 1) ||
+ (TilingConfig::WARP_COL_MMA_TENSORS % 2 ==
+ 0)); // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded to a 16*16 MMA block
+#endif
+ const int NumRegSets_a =
+ WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block
+ const int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1)
+ ? 1
+ : TilingConfig::WARP_COL_MMA_TENSORS /
+ 2; // 1 set = 4 registers, containing a 16*16 MMA block
+
+ // Registers to store FP32 results
+ uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] =
+ reinterpret_cast<uint32_t(*)[REG_PER_THREAD_C_TENSOR_16_16]>(c);
+ // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6
+ // register per thread;
+ uint32_t a_1[2 * 2]; // double buffer is used
+ uint32_t a_2[4 * 2]; // double buffer is used
+ // Registers to store decompressed FP6
+ uint32_t a[NumRegSets_a * 1][4]; // No double buffer
+ // Register to store FP16 B matrix (a slice)
+ uint32_t b[NumRegSets_b * 2][4]; // double buffer is used
+
+ // Overlapped Smem and TC pipeline: pre-loading from shared to registers
+ CopyFromSharedToRegister_AFrag<2>(a_1, read_SPTR_Frag1);
+ CopyFromSharedToRegister_AFrag<4>(a_2, read_SPTR_Frag2);
+ B_FromSharedToReg<TilingConfig>(b, read_SPTR, 0);
+
+#pragma unroll
+ for (int k = 0; k < WARP_K_MMA_TENSORS; k++) {
+ uint32_t(*b_read)[4] = b;
+ uint32_t(*b_write)[4] = b;
+ uint32_t* a_1_read = a_1;
+ uint32_t* a_1_write = a_1;
+ uint32_t* a_2_read = a_2;
+ uint32_t* a_2_write = a_2;
+ if (k % 2 == 0) {
+ b_write += NumRegSets_b;
+ a_1_write += 2;
+ a_2_write += 4;
+ } else {
+ b_read += NumRegSets_b;
+ a_1_read += 2;
+ a_2_read += 4;
+ }
+ // data loading
+ if (k + 1 < WARP_K_MMA_TENSORS) {
+ // updating SPTR for fragment1 and fragment2
+ read_SPTR_Frag1 += 2 * WARP_SIZE;
+ read_SPTR_Frag2 += 4 * WARP_SIZE;
+ CopyFromSharedToRegister_AFrag<2>(a_1_write, read_SPTR_Frag1);
+ CopyFromSharedToRegister_AFrag<4>(a_2_write, read_SPTR_Frag2);
+ B_FromSharedToReg<TilingConfig>(b_write, read_SPTR, (k + 1) * MMA_16);
+ }
+ // SIMT Dequant + Tensor Core computations
+ Dequant_32FP6_4Way(
+ a, a_1_read, a_2_read, RPTR_Scales); // Dequantizing FP6 to FP16 at register level,
+ // dequantizing a slice each time
+#pragma unroll
+ for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) {
+ if (TilingConfig::WARP_COL_MMA_TENSORS == 1)
+ MMA_FP16_M16N8K16(c_uint_ptr[i], a[i], b_read[0]);
+ else {
+#pragma unroll
+ for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) {
+ MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a[i], b_read[j]);
+ MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4,
+ a[i],
+ b_read[j] + 2); // c+4; b+2
+ }
+ }
+ }
+ }
+}
+#endif // #ifdef PIPELINE_LEVEL_SMEM
+
+template <typename TilingConfig>
+__device__ __forceinline__ void StoreToSharedMemoryFromRegister(
+ float (*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4],
+ float c[][REG_PER_THREAD_C_TENSOR_16_16])
+{
+ const int lane_id = threadIdx.x % WARP_SIZE;
+ const int warpId = threadIdx.x / WARP_SIZE;
+ int warp_row_offset = warpId * (MMA_16 * WARP_ROW_MMA_TENSORS);
+#pragma unroll
+ for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) {
+#pragma unroll
+ for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS;
+ j++) { // Dealing with one 16*8 Tensor
+ int RegSetID = i + (j / 2) * WARP_ROW_MMA_TENSORS;
+ int RegOffset = (j % 2) * (REG_PER_THREAD_C_TENSOR_16_16 / 2);
+ int Tensor_row_offset = warp_row_offset + i * MMA_16;
+ int Tensor_col_offset = j * MMA_8;
+#pragma unroll
+ for (int r = 0; r < REG_PER_THREAD_C_TENSOR_16_16 / 2; r++) {
+ int row_offset = lane_id / 4;
+ if (r >= 2) row_offset += 8;
+ int col_offset = (lane_id % 4) * 2;
+ if (r % 2 == 1) col_offset += 1;
+ smem_CFrag[Tensor_col_offset + col_offset][Tensor_row_offset + row_offset] =
+ c[RegSetID][r + RegOffset];
+ }
+ }
+ }
+}
+
+#endif
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh
new file mode 100644
index 000000000000..62b77edaa37a
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh
@@ -0,0 +1,86 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#ifndef UTILS_GMEM_CUH
+#define UTILS_GMEM_CUH
+
+#include <assert.h>
+#include "configs.h"
+#include "ptx_cp.async.cuh"
+
+/*
+ * Copying A1/A2 from global memory to shared memory.
+ * Usually 1024 or 2048 Bytes
+ */
+template <int SMEM_SIZE_IN_BYTES_PER_WARP>
+__device__ __forceinline__ void CopyFromGlobalToShared_A(uint32_t* SPTR,
+ const uint4* GPTR,
+ bool pred_guard = true)
+{
+#ifdef DEBUG_MODE
+ static_assert(SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE % 16 == 0);
+#endif
+ int lane_id = threadIdx.x % WARP_SIZE;
+ half* SPTR_HALF = reinterpret_cast<half*>(SPTR);
+ const half* GPTR_HALF = reinterpret_cast<const half*>(GPTR);
+ SPTR_HALF += lane_id * 8;
+ GPTR_HALF += lane_id * 8;
+#pragma unroll
+ for (int i = 0; i < SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE / 16; i++) {
+ cp_async<16>(SPTR_HALF, GPTR_HALF, pred_guard);
+ SPTR_HALF += 256; // Forward 512 Bytes
+ GPTR_HALF += 256; // Forward 512 Bytes
+ }
+}
+
+/*
+ * Copying 64 Quant Scales (FP16) from global memory to shared memory.
+ */
+__device__ __forceinline__ void CopyFromGlobalToShared_Scales(half* SPTR_QuantScales,
+ const half* GPTR_A_Scales)
+{
+ int lane_id = threadIdx.x % WARP_SIZE;
+ int Offset_Shared = lane_id * 2;
+ int Offset_Global = lane_id / 4 + (lane_id % 4) * 16;
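+ // e.g., lane 0 copies global scales {0, 8} into shared slots {0, 1}; lane 1 copies {16, 24}
+ // into slots {2, 3}.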
+ for (int i = 0; i < 2; i++)
+ SPTR_QuantScales[Offset_Shared + i] = GPTR_A_Scales[Offset_Global + i * 8];
+}
+
+/*
+ * (1) Copying X rows * 64 columns of FP16 values, originally in row major
+ * (2) Copying 64 rows * X columns of FP16 values, originally in column major
+ * 16 Bytes per thread -> 512 Bytes per WARP = 4 line per WARP = 1 line per 8 Threads
+ */
+template <int MaxNumOfLinesToCopy, int BLOCK_WARPS>
+__device__ __forceinline__ void CopyFromGlobalToShared(
+ half __restrict__ (*SharedPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8],
+ const half* GlobalPTR,
+ const int GlobalStride,
+ const int NumOfLinesLeft, // To support arbitrary N dimensions.
+ bool Pred = true)
+{
+ // static parameters: 1 Group (8 Threads) can copy 1 line (64 FP16) each time
+ const int NumOfThreads = BLOCK_WARPS * WARP_SIZE;
+ const int NumOfGroups = NumOfThreads / 8;
+ const int MaxIteration = (MaxNumOfLinesToCopy - 1) / NumOfGroups + 1;
+ // runtime variables
+ const int line_id = threadIdx.x / 8;
+ const int line_offset = (threadIdx.x % 8) * 8;
+ // PTR for source global memory and target shared memory
+ GlobalPTR += line_id * GlobalStride + line_offset;
+ SharedPTR += line_id;
+#pragma unroll
+ for (int i = 0; i < MaxIteration; i++) {
+ bool AsyncCopyPred = (line_id + i * NumOfGroups) < NumOfLinesLeft && Pred;
+ cp_async<16>(&(*SharedPTR)[line_offset], GlobalPTR, AsyncCopyPred);
+ //
+ GlobalPTR += NumOfGroups * GlobalStride;
+ SharedPTR += NumOfGroups;
+ }
+}
+
+#endif
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh
new file mode 100644
index 000000000000..ff13868c1347
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh
@@ -0,0 +1,127 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#ifndef UTILS_PARALLELDEQUANT_CUH
+#define UTILS_PARALLELDEQUANT_CUH
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+/*
+ * Input: R1
+ * Outputs: R1, R2
+ * Note: Simplified Exponent calculation is applied.
+ */
+__device__ __forceinline__ void FP6_FP16_Cast_4Way(u_int32_t* R1, u_int32_t* R2)
+{
+ *R2 = *R1 & 0x80808080;
+ *R1 = *R1 >> 2;
+ *R1 = *R1 & 0x1f1f1f1f;
+ *R2 = *R2 | *R1;
+ *R1 = *R2 & 0x9f009f00;
+ *R2 = *R2 & 0x009f009f;
+ *R2 = *R2 << 8;
+}
+
+/*
+ * Input: R1
+ * Outputs: R1, R2
+ * Note: Simplified Exponent calculation is NOT applied.
+ */
+__device__ __forceinline__ void FP6_FP16_Cast_4Way_Naive(u_int32_t* R1, u_int32_t* R2)
+{
+ //*R2 = *R1 & 0x80808080;
+ *R2 = *R1 & 0xc0c0c0c0;
+ *R1 = *R1 >> 2;
+ //*R1 = *R1 & 0x1f1f1f1f;
+ *R1 = *R1 & 0x0f0f0f0f;
+ *R2 = *R2 | *R1;
+ //
+ //*R1 = *R2 & 0x9f009f00;
+ //*R2 = *R2 & 0x009f009f;
+ *R1 = *R2 & 0xcf00cf00;
+ if (!(*R1 & 0x40000000) && (*R1 & 0x0c000000)) *R1 = *R1 | 0x30000000;
+ if (!(*R1 & 0x00004000) && (*R1 & 0x00000c00)) *R1 = *R1 | 0x00003000;
+ *R2 = *R2 & 0x00cf00cf;
+ if (!(*R2 & 0x00400000) && (*R2 & 0x000c0000)) *R2 = *R2 | 0x00300000;
+ if (!(*R2 & 0x00000040) && (*R2 & 0x0000000c)) *R2 = *R2 | 0x00000030;
+ //
+ *R2 = *R2 << 8;
+ //*R1 = 0x3c003c00;
+ //*R2 = 0x3c003c00;
+}
+
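+/*
+ * Multiplies a pair of packed FP16 values by (4096 * Scale); the 4096 (= 2^12) factor compensates
+ * for the exponent-bias shift introduced by the bit-level FP6 -> FP16 cast above.
+ */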
+__device__ __forceinline__ u_int32_t MultScale(u_int32_t PackedFP16Pair, half Scale)
+{
+ half* FP16_1 = reinterpret_cast(&PackedFP16Pair);
+ half* FP16_2 = FP16_1 + 1;
+ uint32_t output;
+ half* output_half_ptr = reinterpret_cast(&output);
+ output_half_ptr[0] = __hmul(__hmul(*FP16_1, __float2half(4096.0f)), Scale);
+ output_half_ptr[1] = __hmul(__hmul(*FP16_2, __float2half(4096.0f)), Scale);
+ return output;
+}
+
+__device__ __forceinline__ void Dequant_32FP6_4Way(u_int32_t __restrict__ Reg[][4],
+ u_int32_t __restrict__* read_RPTR_Frag1,
+ u_int32_t __restrict__* read_RPTR_Frag2,
+ u_int32_t* Scales)
+{
+ u_int32_t* OutputRegs = reinterpret_cast<u_int32_t*>(Reg);
+ u_int32_t* Frag1_PTR = read_RPTR_Frag1;
+ u_int32_t* Frag2_PTR = read_RPTR_Frag2;
+ half* Scale_RPTR = reinterpret_cast<half*>(Scales);
+ u_int32_t Packed_FP6 = 0;
+ u_int32_t tmp = 0;
+// Dequantizing 32 FP6, each Loop dequantizing 4 FP6
+#pragma unroll(8)
+ for (int i = 0; i < 8; i++) {
+ // Frag1
+ Packed_FP6 = (*Frag1_PTR) & 0xc0c0c0c0;
+ if (i % 4 == 3)
+ Frag1_PTR++;
+ else
+ (*Frag1_PTR) = (*Frag1_PTR) << 2;
+ // Frag2
+ tmp = (*Frag2_PTR) & 0xf0f0f0f0;
+ tmp = tmp >> 2;
+ if (i % 2 == 1)
+ Frag2_PTR++;
+ else
+ (*Frag2_PTR) = (*Frag2_PTR) << 4;
+ // Packed_FP6
+ Packed_FP6 = Packed_FP6 | tmp;
+ //
+ FP6_FP16_Cast_4Way(&Packed_FP6, &tmp);
+ //
+ *OutputRegs = MultScale(Packed_FP6, Scale_RPTR[0]); // Multiply by FP16 scales
+ OutputRegs += 1;
+ *OutputRegs = MultScale(tmp, Scale_RPTR[1]); // Multiply by FP16 scales
+ OutputRegs += 1;
+ // Updating offset for FP16 scales for every two iterations
+ if (i % 2 == 1) Scale_RPTR += 2;
+ }
+}
+
+/*
+ * Extracts the quantization scales for the current thread from shared memory into registers.
+ */
+__device__ __forceinline__ void ExtractFromSharedToReg_Scales(uint32_t* Scales,
+ half* WARP_SPTR_Scales)
+{
+ int lane_id = threadIdx.x % WARP_SIZE;
+ uint32_t* SPTR_uint = reinterpret_cast<uint32_t*>(WARP_SPTR_Scales);
+ uint32_t tmpReg = SPTR_uint[lane_id];
+#pragma unroll
+ for (int i = 0; i < 4; i++) {
+ // T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);
+ Scales[i] = __shfl_sync(0xffffffff, tmpReg, i, 4);
+ }
+}
+
+#endif
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h
new file mode 100644
index 000000000000..c8cc7243f341
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h
@@ -0,0 +1,204 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112
+
+#include <assert.h>
+#include <stdio.h>
+#include <vector>
+
+using namespace std;
+
+void Padding_8_FP6_To_8_Bytes(unsigned char Padded_FP6[],
+ unsigned char* FP6_Array) // padding 0 to the lowest bit locations
+{
+ Padded_FP6[0] = FP6_Array[0] & 0xfc;
+ Padded_FP6[1] = (FP6_Array[0] << 6) | ((FP6_Array[1] >> 2) & 0xfc);
+ Padded_FP6[2] = (FP6_Array[1] << 4) | ((FP6_Array[2] >> 4) & 0xfc);
+ Padded_FP6[3] = FP6_Array[2] << 2;
+ Padded_FP6[4] = FP6_Array[3] & 0xfc;
+ Padded_FP6[5] = (FP6_Array[3] << 6) | ((FP6_Array[4] >> 2) & 0xfc);
+ Padded_FP6[6] = (FP6_Array[4] << 4) | ((FP6_Array[5] >> 4) & 0xfc);
+ Padded_FP6[7] = FP6_Array[5] << 2;
+}
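+// After padding, each of the 8 FP6 values sits in the top 6 bits of its own byte (the low 2 bits
+// are zero), which makes the 2-bit/4-bit extraction below a matter of simple masks and shifts.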
+
+unsigned char Extract_2_Bits_From_4_PaddedFP6(unsigned char B1,
+ unsigned char B2,
+ unsigned char B3,
+ unsigned char B4)
+{
+ unsigned char out;
+ out = (B1 & 0xc0) | ((B2 & 0xc0) >> 2) | ((B3 & 0xc0) >> 4) | ((B4 & 0xc0) >> 6);
+ return out;
+}
+
+unsigned char Extract_4_Bits_From_2_PaddedFP6(
+ unsigned char B1,
+ unsigned char
+ B2) // The highest two bits are already extracted by Extract_2_Bits_From_4_PaddedFP6();
+{
+ unsigned char out;
+ out = ((B1 << 2) & 0xf0) | ((B2 >> 2) & 0x0f);
+ return out;
+}
+
+// dealing with 4 1*8 blocks of FP6
+void Assign_32_FP6_To_4_Thread(vector<unsigned char> Seg_2bit[],
+ vector<unsigned char> Seg_4bit[],
+ unsigned char* PTR_1,
+ unsigned char* PTR_2,
+ unsigned char* PTR_3,
+ unsigned char* PTR_4)
+{
+ unsigned char Padded_8_FP8[4][8];
+ Padding_8_FP6_To_8_Bytes(Padded_8_FP8[0], PTR_1);
+ Padding_8_FP6_To_8_Bytes(Padded_8_FP8[1], PTR_2);
+ Padding_8_FP6_To_8_Bytes(Padded_8_FP8[2], PTR_3);
+ Padding_8_FP6_To_8_Bytes(Padded_8_FP8[3], PTR_4);
+ //
+ unsigned char Seg1_Byte1_T[4];
+ unsigned char Seg1_Byte2_T[4];
+ unsigned char Seg2_Byte1_T[4];
+ unsigned char Seg2_Byte2_T[4];
+ unsigned char Seg2_Byte3_T[4];
+ unsigned char Seg2_Byte4_T[4];
+ for (int t = 0; t < 4; t++) {
+ Seg1_Byte1_T[t] = Extract_2_Bits_From_4_PaddedFP6(Padded_8_FP8[0][0 + t * 2],
+ Padded_8_FP8[0][1 + t * 2],
+ Padded_8_FP8[1][0 + t * 2],
+ Padded_8_FP8[1][1 + t * 2]);
+ Seg1_Byte2_T[t] = Extract_2_Bits_From_4_PaddedFP6(Padded_8_FP8[2][0 + t * 2],
+ Padded_8_FP8[2][1 + t * 2],
+ Padded_8_FP8[3][0 + t * 2],
+ Padded_8_FP8[3][1 + t * 2]);
+ Seg2_Byte1_T[t] =
+ Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[0][0 + t * 2], Padded_8_FP8[0][1 + t * 2]);
+ Seg2_Byte2_T[t] =
+ Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[1][0 + t * 2], Padded_8_FP8[1][1 + t * 2]);
+ Seg2_Byte3_T[t] =
+ Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[2][0 + t * 2], Padded_8_FP8[2][1 + t * 2]);
+ Seg2_Byte4_T[t] =
+ Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[3][0 + t * 2], Padded_8_FP8[3][1 + t * 2]);
+ }
+ //
+ for (int t = 0; t < 4; t++) {
+ Seg_2bit[t].push_back(Seg1_Byte1_T[t]);
+ Seg_2bit[t].push_back(Seg1_Byte2_T[t]);
+ Seg_4bit[t].push_back(Seg2_Byte1_T[t]);
+ Seg_4bit[t].push_back(Seg2_Byte2_T[t]);
+ Seg_4bit[t].push_back(Seg2_Byte3_T[t]);
+ Seg_4bit[t].push_back(Seg2_Byte4_T[t]);
+ }
+ return;
+}
+
+void BitInterleaving_2bit(unsigned char* PTR_4Bytes)
+{
+ unsigned int* PTR_UINT = reinterpret_cast<unsigned int*>(PTR_4Bytes);
+ unsigned int input = *PTR_UINT;
+ //
+ // int order_2bit[16] = {1,5,9,13,3,7,11,15,2,6,10,14,4,8,12,16}; // pre-defined order for
+ // bit-interleaving in QuantLLM
+ int order_2bit[16] = {
+ 2, 6, 10, 14, 4, 8, 12, 16, 1, 5, 9, 13, 3, 7, 11, 15}; // pre-defined order for
+ // bit-interleaving in QuantLLM
+ unsigned int Frags_2bit[16]; // The highest 2 bits are used to store the extracted fragments.
+ for (int i = 0; i < 16; i++) Frags_2bit[i] = (input << 2 * (order_2bit[i] - 1)) & 0xc0000000;
+ //
+ unsigned int output = 0x00000000;
+ for (int i = 0; i < 16; i++) output |= (Frags_2bit[i] >> (i * 2));
+ //
+ *PTR_UINT = output;
+}
+
+void BitInterleaving_4bit(unsigned char* PTR_4Bytes)
+{
+ unsigned int* PTR_UINT = reinterpret_cast<unsigned int*>(PTR_4Bytes);
+ unsigned int input = *PTR_UINT;
+ //
+ // int order_4bit[8] = {1,5,3,7,2,6,4,8}; // pre-defined order for bit-interleaving in QuantLLM
+ int order_4bit[8] = {
+ 2, 6, 4, 8, 1, 5, 3, 7}; // pre-defined order for bit-interleaving in QuantLLM
+ unsigned int Frags_4bit[8]; // The highest 4 bits are used to store the extracted fragments.
+ for (int i = 0; i < 8; i++) Frags_4bit[i] = (input << 4 * (order_4bit[i] - 1)) & 0xf0000000;
+ //
+ unsigned int output = 0x00000000;
+ for (int i = 0; i < 8; i++) output |= (Frags_4bit[i] >> (i * 4));
+ //
+ *PTR_UINT = output;
+}
+
+/*
+ * Inputs:
+ * (1) unsigned char Weight_6bit [M*K*6/8]
+ * Outputs:
+ * (1) unsigned char Weight_2bit [M*K*2/8]
+ * (2) unsigned char Weight_4bit [M*K*4/8]
+ *
+ * Assumption: Weight_6bit, Weight_2bit, Weight_4bit are all stored contiguously in row-major order.
+ * 8 FP6 = 6 Bytes
+ * 8 FP4 = 4 Bytes
+ * 8 FP2 = 2 Bytes
+ */
+void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K)
+{
+ assert(M % 64 == 0);
+ assert(K % 64 == 0);
+ //
+ unsigned char* Weight_6bit = reinterpret_cast(FP6Weights);
+ unsigned char* Weight_2bit = Weight_6bit;
+ unsigned char* Weight_4bit = Weight_6bit + M * K * 2 / 8;
+ //
+ vector<unsigned char> A_Segment_2bit[32];
+ vector<unsigned char> A_Segment_4bit[32];
+ //
+ size_t BytesPerRow = K * 6 / 8;
+ // Pass-1: (1) 2+4 split; (2) assign weights to 32 threads.
+ for (size_t i = 0; i < M / 64; i++) //
+ {
+ for (size_t j = 0; j < K / 16; j++) {
+ for (size_t k = 0; k < 64 / 16; k++) {
+ size_t row = i * 64 + k * 16;
+ size_t col = j * 16;
+ unsigned char* StartPTR_1 = Weight_6bit + row * BytesPerRow + col * 6 / 8;
+ unsigned char* StartPTR_2 = StartPTR_1 + 8 * BytesPerRow;
+ unsigned char* StartPTR_3 = StartPTR_1 + 8 * 6 / 8;
+ unsigned char* StartPTR_4 = StartPTR_2 + 8 * 6 / 8;
+ // Dealing with each 16*16 blocks then...
+ for (int l = 0; l < 8; l++)
+ Assign_32_FP6_To_4_Thread(&A_Segment_2bit[l * 4],
+ &A_Segment_4bit[l * 4],
+ StartPTR_1 + l * BytesPerRow,
+ StartPTR_2 + l * BytesPerRow,
+ StartPTR_3 + l * BytesPerRow,
+ StartPTR_4 + l * BytesPerRow);
+ }
+ }
+ }
+ // Verifying the length of 2_bit segments and 4_bit segments
+ size_t BytesPerThread_2bit = M * K * 2 / 8 / 32;
+ size_t BytesPerThread_4bit = M * K * 4 / 8 / 32;
+ for (int i = 0; i < 32; i++) {
+ assert(A_Segment_2bit[i].size() == BytesPerThread_2bit);
+ assert(A_Segment_4bit[i].size() == BytesPerThread_4bit);
+ }
+ // Pass-2: Optimizing coalesced global memory access
+ for (size_t i = 0; i < BytesPerThread_2bit / 4; i++)
+ for (int t = 0; t < 32; t++)
+ for (int b = 0; b < 4; b++)
+ Weight_2bit[i * 128 + t * 4 + (3 - b)] =
+ A_Segment_2bit[t]
+ [i * 4 + b]; // why (3-b): special byte order within a register
+ for (size_t i = 0; i < BytesPerThread_4bit / 4; i++)
+ for (int t = 0; t < 32; t++)
+ for (int b = 0; b < 4; b++)
+ Weight_4bit[i * 128 + t * 4 + (3 - b)] =
+ A_Segment_4bit[t][i * 4 + b]; // why (3-b):special byte order within a register
+ // Pass-3: Bit-level interleaving
+ for (size_t i = 0; i < BytesPerThread_2bit * 32 / 4; i++)
+ BitInterleaving_2bit(Weight_2bit + 4 * i);
+ for (size_t i = 0; i < BytesPerThread_4bit * 32 / 4; i++)
+ BitInterleaving_4bit(Weight_4bit + 4 * i);
+}
diff --git a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu
index 84a9906cf037..cfa62f94596a 100644
--- a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu
+++ b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu
@@ -119,7 +119,10 @@ void launch_gated_activation_impl(T* output,
DISPATCH_UNROLL(5);
} else if (unroll == 6) {
DISPATCH_UNROLL(6);
+ } else if (unroll == 7) {
+ DISPATCH_UNROLL(7);
} else {
+ // TODO: provide a kernel with an outer loop to handle larger columns.
throw std::runtime_error(
"Called with more columns than supported, please report this bug and this limit will "
"be increased.");
diff --git a/deepspeed/inference/v2/model_implementations/__init__.py b/deepspeed/inference/v2/model_implementations/__init__.py
index 869c4316cdc7..14b0654a8c36 100644
--- a/deepspeed/inference/v2/model_implementations/__init__.py
+++ b/deepspeed/inference/v2/model_implementations/__init__.py
@@ -16,3 +16,4 @@
from .falcon import *
from .phi import *
from .qwen import *
+from .qwen_v2 import *
diff --git a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py
index f9da7ac5d23e..ebdb59bca920 100644
--- a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py
+++ b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py
@@ -164,7 +164,7 @@ def process_layer(layer_container: LayerContainer, l_name: str, cur_offset: int)
strides=tensor.stride(),
offset=cur_offset)
- cur_offset += pad_to_aligned_offset(elem_size(param.dtype) * param.numel())
+ cur_offset += pad_to_aligned_offset(elem_size(tensor.dtype) * tensor.numel())
layer_metadata.params[p_name] = param_metadata
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py b/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py
new file mode 100644
index 000000000000..80b09757c74d
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .policy import Qwen2Policy
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/container.py b/deepspeed/inference/v2/model_implementations/qwen_v2/container.py
new file mode 100644
index 000000000000..6556d87d6afb
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2/container.py
@@ -0,0 +1,82 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Create a container object to save model-specific tensors using the policy file above.
+
+from ..common_parameters import *
+from ..layer_container_base import LayerContainer
+'''
+ # HF Qwen2 model looks like this:
+
+Qwen2ForCausalLM(
+ (model): Qwen2Model(
+ (embed_tokens): Embedding(151936, 1024)
+ (layers): ModuleList(
+ (0-23): 24 x Qwen2DecoderLayer(
+ (self_attn): Qwen2SdpaAttention(
+ (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
+ (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
+ (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
+ (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+ (rotary_emb): Qwen2RotaryEmbedding()
+ )
+ (mlp): Qwen2MLP(
+ (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+ (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+ (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+ (act_fn): SiLU()
+ )
+ (input_layernorm): Qwen2RMSNorm()
+ (post_attention_layernorm): Qwen2RMSNorm()
+ )
+ )
+ (norm): Qwen2RMSNorm()
+ )
+ (lm_head): Linear(in_features=1024, out_features=151936, bias=False)
+)
+'''
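+# Note: in Qwen2 only the q/k/v projections carry biases (o_proj and the MLP projections do not),
+# so only qkv_b is mapped below.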
+
+
+class Qwen2TransformerContainer(LayerContainer):
+ """
+ Transformer layer container for the Qwen2 model.
+ """
+ qkv_w: UnfusedQKVParameter
+ qkv_b: UnfusedQKVParameter
+ attn_out_w: AttentionOutputParameter
+ mlp_1_w: GatedMLPParameter
+ mlp_2_w: MLP2Parameter
+ attn_norm_gamma: NormParameter
+ mlp_norm_gamma: NormParameter
+
+ PARAM_MAPPING = {
+ "self_attn.q_proj.weight": "qkv_w.q_params",
+ "self_attn.k_proj.weight": "qkv_w.k_params",
+ "self_attn.v_proj.weight": "qkv_w.v_params",
+ "self_attn.q_proj.bias": "qkv_b.q_params",
+ "self_attn.k_proj.bias": "qkv_b.k_params",
+ "self_attn.v_proj.bias": "qkv_b.v_params",
+ "self_attn.o_proj.weight": "attn_out_w.params",
+ "mlp.gate_proj.weight": "mlp_1_w.gate_params",
+ "mlp.up_proj.weight": "mlp_1_w.up_params",
+ "mlp.down_proj.weight": "mlp_2_w.params",
+ "input_layernorm.weight": "attn_norm_gamma.params",
+ "post_attention_layernorm.weight": "mlp_norm_gamma.params",
+ }
+
+
+class Qwen2NonTransformerContainer(LayerContainer):
+ """
+ Non-Transformer layer container for the Qwen2 model.
+ """
+ word_emb: EmbeddingParameter
+ word_unembed: UnembedParameter
+ final_norm: NormParameter
+
+ PARAM_MAPPING = {
+ "model.embed_tokens.weight": "word_emb.params",
+ "model.norm.weight": "final_norm.params",
+ "lm_head.weight": "word_unembed.params",
+ }
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/model.py b/deepspeed/inference/v2/model_implementations/qwen_v2/model.py
new file mode 100644
index 000000000000..d535462a954d
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2/model.py
@@ -0,0 +1,221 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+
+import deepspeed.comm as dist
+
+from ...allocator import empty_from
+from ...inference_utils import ActivationType, DtypeEnum
+from .. import *
+from ...modules.configs import *
+from ...modules.interfaces import *
+from ...modules import heuristics
+from ...ragged import RaggedBatchWrapper
+
+from .container import Qwen2NonTransformerContainer, Qwen2TransformerContainer
+
+
+class Qwen2InferenceModel(DSTransformerModelBase):
+ """
+ Inference model implementation for ragged batching for Qwen2 models.
+ """
+
+ _non_transformer: Optional[Qwen2NonTransformerContainer]
+ """
+ Embed + unembed container. Specializing the type annotation.
+ """
+
+ _transformer: Optional[Iterable[Qwen2TransformerContainer]]
+ """
+ Per-layer transformer container. Specializing the type annotation.
+ """
+ """
+ Properties inherited from `DSInferenceModelBase`
+ """
+
+ @property
+ def max_sequence_length(self) -> int:
+ return self._config.max_seq_length
+
+ """
+ Properties inherited from `DSTransformerModelBase`
+ """
+
+ @property
+ def num_layers(self) -> int:
+ return self._config.num_hidden_layers
+
+ @property
+ def model_dim(self) -> int:
+ return self._config.hidden_size
+
+ @property
+ def vocab_size(self) -> int:
+ return self._config.vocab_size
+
+ @property
+ def head_size(self) -> int:
+ return self.model_dim // self.n_heads
+
+ @property
+ def n_heads(self) -> int:
+ return self._config.num_attention_heads
+
+ @property
+ def intermediate_dim(self) -> int:
+ return self._config.intermediate_size
+
+ @property
+ def n_heads_kv(self) -> int:
+ return self._config.num_key_value_heads
+
+ @property
+ def activation_dtype(self) -> DtypeEnum:
+ # TODO(ZonePG): bf16 inference results may be different from huggingface bf16,
+ # because in rms_norm, Qwen still use float() instead of bf16
+ # if self._config.torch_dtype == torch.float16:
+ # return DtypeEnum.fp16
+ # elif self._config.torch_dtype == torch.bfloat16:
+ # return DtypeEnum.bf16
+ # else:
+ # raise NotImplementedError("Only fp16 and bf16 are supported")
+ return DtypeEnum.fp16
+
+ @property
+ def mlp_activation_fn(self) -> ActivationType:
+ return ActivationType.SiGLU
+
+ @property
+ def norm_type(self) -> NormTypeEnum:
+ return NormTypeEnum.RMSNorm
+
+ @property
+ def positional_embedding_type(self) -> PositionalEmbeddingType:
+ return PositionalEmbeddingType.rotate_half
+
+ @property
+ def positional_embedding_config(self) -> Optional[RotateHalfConfig]:
+ return RotateHalfConfig(theta_base=self._config.rope_theta)
+
+ def make_norm_layer(self) -> None:
+ """
+ Instantiates the normalization layer for the model. This sets the `self.norm` attribute.
+
+ TODO(cmikeh2): In the future we'll distinguish between the different norm objects,
+ but for now we'll just use the same one for all of them.
+ """
+ norm_config = DSNormConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ type=self.norm_type,
+ channels=self.model_dim,
+ residual_dtype=self.activation_dtype,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ eps=self._config.rms_norm_eps,
+ )
+
+ self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config)
+
+ """
+ Forward implementations
+ """
+
+ def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs the embedding lookup prior to running the transformer of the model.
+
+ Arguments:
+ ragged_batch (RaggedBatchWrapper): The batch to embed.
+
+ Returns:
+ torch.Tensor: The embedded batch.
+ """
+ embed = self.embed(ragged_batch, self._non_transformer.word_emb)
+
+ if embed.shape[-1] != self.model_dim:
+ raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}")
+
+ return embed
+
+ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor,
+ ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+        Executes one (slightly offset) layer of the transformer. This implementation does a peek-ahead
+ optimization to fuse the layer norm of the next layer into the current layer.
+
+ Arguments:
+ layer_idx (int): The index of the layer to execute.
+ residual (torch.Tensor): The residual tensor from the previous layer.
+ hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
+ hidden states after pre normalization.
+ ragged_batch_info (RaggedBatchWrapper): The batch metadata.
+ """
+ # TODO(cmikeh2): Distribute ragged_batch_info to all modules
+
+ cur_params = self._transformer[layer_idx]
+ kv_cache = self.state_manager.get_cache(layer_idx)
+
+ hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b)
+ hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info)
+ hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None)
+
+ # Should be configurable in the future
+ hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None)
+ hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ if layer_idx != self.num_layers - 1:
+ next_params = self._transformer[layer_idx + 1]
+ residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None)
+ else:
+ # On last layer, we just need to perform the residual add. Adding into the residual
+ # here is safe.
+ residual.add_(hidden_states)
+
+ return residual, hidden_states
+
+ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs unembedding of the hidden states to logits. This will only sample the final
+ token of each sequence.
+ """
+ logits = self.unembed(hidden_states,
+ self._non_transformer.word_unembed,
+ ragged_batch_info,
+ gamma=self._non_transformer.final_norm)
+
+ if self.tp_size > 1:
+ comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1]))
+ full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size))
+
+ dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group)
+
+ full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size))
+
+ return full_logits
+ else:
+ return logits
+
+ def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor:
+
+ residual = self._forward_embed(wrapped_batch)
+
+ residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None)
+
+ for layer_idx in range(self.num_layers):
+ residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states,
+ wrapped_batch)
+
+ return self._forward_unembed(residual, wrapped_batch)
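For readers unfamiliar with the fused pre-norm pattern in `_forward_transformer_layer` and `forward` above, the following is a minimal sketch of the same control flow in plain PyTorch. Everything here (`rms_norm`, `toy_block`, `gammas`) is a stand-in, not a DeepSpeed API, and the sketch collapses the per-layer attention/MLP split (and the `mlp_norm_gamma` norm between them) into a single block; only the peek-ahead fusion of the next layer's pre-norm is illustrated.

```python
# Minimal sketch of the peek-ahead pre-norm loop (toy stand-ins, not DeepSpeed APIs).
import torch


def rms_norm(x: torch.Tensor, gamma: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Plain RMSNorm: normalize by the root-mean-square, then scale by gamma.
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * gamma


def toy_block(h: torch.Tensor) -> torch.Tensor:
    # Stand-in for one layer's attention + MLP; shapes are preserved.
    return torch.tanh(h)


def forward(embedded: torch.Tensor, gammas: list) -> torch.Tensor:
    # `gammas[i]` plays the role of layer i's attention pre-norm weight.
    residual = embedded
    hidden = rms_norm(residual, gammas[0])  # layer 0's pre-norm is applied up front
    num_layers = len(gammas)
    for i in range(num_layers):
        out = toy_block(hidden)
        if i != num_layers - 1:
            # Peek ahead: fuse the residual add with the *next* layer's pre-norm.
            residual = residual + out
            hidden = rms_norm(residual, gammas[i + 1])
        else:
            # Last layer: only the residual add is needed before unembedding.
            residual = residual + out
    return residual


if __name__ == "__main__":
    x = torch.randn(4, 8)
    print(forward(x, [torch.ones(8) for _ in range(3)]).shape)  # torch.Size([4, 8])
```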
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py b/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py
new file mode 100644
index 000000000000..9c5db2ba0065
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py
@@ -0,0 +1,31 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Any
+
+from ...config_v2 import RaggedInferenceEngineConfig
+from ..inference_policy_base import ContainerMap, InferenceV2Policy
+from .container import Qwen2NonTransformerContainer, Qwen2TransformerContainer
+from .model import Qwen2InferenceModel
+
+
+class Qwen2Policy(InferenceV2Policy):
+
+ def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Qwen2InferenceModel:
+ return Qwen2InferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group)
+
+ def build_container_map(self) -> ContainerMap:
+ map = ContainerMap()
+
+ transformer_containers = [Qwen2TransformerContainer(self.model) for _ in range(self.model.num_layers)]
+
+ map.set_transformer_params(['model.layers'], transformer_containers)
+
+ map.set_non_transformer_params(Qwen2NonTransformerContainer(self.model))
+
+ map.set_unmapped_params(
+ [f'model.layers.{i}.self_attn.rotary_emb.inv_freq' for i in range(self.model.num_layers)])
+
+ return map
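To make the container map above more concrete, the sketch below shows the kind of prefix-based routing it implies: parameters under `model.layers.<i>` go to the i-th transformer container, the names listed in `PARAM_MAPPING` go to the non-transformer container, and the rotary `inv_freq` buffers are left unmapped. The `route` helper and the `q_proj` name are hypothetical illustrations; the real dispatch happens inside `InferenceV2Policy`.

```python
# Hypothetical illustration of how checkpoint names are routed by the container map above.
import re

PARAM_MAPPING = {
    "model.embed_tokens.weight": "word_emb.params",
    "model.norm.weight": "final_norm.params",
    "lm_head.weight": "word_unembed.params",
}


def route(name: str) -> str:
    layer = re.match(r"model\.layers\.(\d+)\.(.+)", name)
    if layer is not None:
        if layer.group(2) == "self_attn.rotary_emb.inv_freq":
            return "unmapped (ignored)"
        return f"transformer container #{layer.group(1)}"
    if name in PARAM_MAPPING:
        return f"non-transformer container -> {PARAM_MAPPING[name]}"
    return "unknown"


for n in ("model.embed_tokens.weight",
          "model.layers.0.self_attn.q_proj.weight",           # assumed HF parameter name
          "model.layers.0.self_attn.rotary_emb.inv_freq"):
    print(n, "->", route(n))
```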
diff --git a/deepspeed/inference/v2/modules/heuristics.py b/deepspeed/inference/v2/modules/heuristics.py
index b89e95c0d834..d176206f3c60 100644
--- a/deepspeed/inference/v2/modules/heuristics.py
+++ b/deepspeed/inference/v2/modules/heuristics.py
@@ -86,8 +86,15 @@ def instantiate_linear(linear_config: DSLinearConfig, engine_config: RaggedInfer
A linear module implementing the given configuration.
"""
- # Currently, we only have one implementation, so we just return it.
- config = ConfigBundle(name="blas_fp_linear", config=linear_config)
+ quantization_mode = engine_config.quantization.quantization_mode
+ if quantization_mode is None:
+ config = ConfigBundle(name="blas_fp_linear", config=linear_config)
+ else:
+ # Currently, we only support ``quantized_wf6af16_linear``.
+ if quantization_mode == "wf6af16":
+ config = ConfigBundle(name="quantized_wf6af16_linear", config=linear_config)
+ else:
+ raise ValueError(f"Unsupported quantization mode: {quantization_mode}")
return DSLinearRegistry.instantiate_config(config)
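As a usage note for the branch added above: the FP6 path is selected by setting `quantization_mode` to `"wf6af16"` on the engine config's quantization section (the field path `engine_config.quantization.quantization_mode` is taken from the code above). The construction below is a sketch only and assumes the pydantic config accepts a nested dict for its `quantization` field; consult `RaggedInferenceEngineConfig` for the authoritative schema.

```python
# Hedged sketch: enabling the FP6 linear implementation through the engine config.
# Assumes the config accepts a nested dict for its quantization section.
from deepspeed.inference.v2.config_v2 import RaggedInferenceEngineConfig

engine_config = RaggedInferenceEngineConfig(quantization={"quantization_mode": "wf6af16"})
assert engine_config.quantization.quantization_mode == "wf6af16"
```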
diff --git a/deepspeed/inference/v2/modules/implementations/linear/__init__.py b/deepspeed/inference/v2/modules/implementations/linear/__init__.py
index e76aab71c4cf..0501af54c4e6 100644
--- a/deepspeed/inference/v2/modules/implementations/linear/__init__.py
+++ b/deepspeed/inference/v2/modules/implementations/linear/__init__.py
@@ -4,3 +4,4 @@
# DeepSpeed Team
from .blas_fp_linear import BlasFPLinear
+from .quantized_linear import QuantizedWf6Af16Linear, fp_quantize
diff --git a/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py b/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py
new file mode 100644
index 000000000000..933cf55b2391
--- /dev/null
+++ b/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py
@@ -0,0 +1,205 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Any, Dict, Optional
+
+import torch
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import InferenceCoreBuilder
+from ....allocator import empty_from
+from ....inference_utils import is_gated
+from ....kernels.core_ops import (
+ CUDAWf6Af16Linear,
+ CUDABiasActivation,
+ CUDAGatedActivation,
+)
+
+from ...interfaces import DSLinearBase, DSLinearRegistry
+from ...configs import DSLinearConfig
+from ....inference_parameter import InferenceParameter
+
+
+def fp_quantize(input: torch.FloatTensor,
+ num_bits: int = 6,
+ exp_bits: int = 3,
+ min_value: torch.FloatTensor = None,
+ max_value: torch.FloatTensor = None,
+ group_size: int = -1):
+ """
+ Args:
+        input (`torch.FloatTensor`)
+            The input which needs to be quantized
+        num_bits (int, >=4)
+            Number of bits to use for quantization
+        exp_bits (int)
+            Number of exponent bits in the fp format
+        min_value/max_value (torch.FloatTensor)
+            Used for static activation quantization
+        group_size (int)
+            The quantization block size; each group of `group_size` numbers has its
+            own scaling factor and offset. -1 means use the last dim as the group_size
+    Returns:
+        quantized_fake_fp6
+            The quantized weights, in fp16 format, containing fp6 values.
+ scales
+ Quantization scales
+ """
+
+ try:
+ from qtorch.quant import float_quantize
+ except ImportError:
+ raise ImportError("Please install qtorch to use this function")
+
+ assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None)
+
+ assert input.dtype == torch.float16
+
+ orig_device = input.device
+ input = input.to(torch.float32).to(get_accelerator().current_device())
+ if num_bits == 6 and exp_bits == 3: # this is default
+ q_range = 28
+ else:
+ raise NotImplementedError
+
+ man_bits = num_bits - exp_bits - 1
+ input_shape = input.shape
+
+ if group_size == -1:
+ group_size = input_shape[-1]
+ else:
+ # Only support per-channel quantization
+ raise NotImplementedError
+ num_groups = input.numel() // group_size
+ input = input.reshape(num_groups, -1)
+
+ if min_value is None:
+ max_input = torch.amax(torch.abs(input), dim=-1).view(num_groups, -1)
+ else:
+ max_input = torch.max(min_value.abs(), max_value) # .view(-1)
+ scales = max_input / q_range # q_range + 1
+ scales[scales == 0] = 1 # avoid zero scales
+ scaled_input = input / scales
+
+ quantized_fake_fp6 = float_quantize(scaled_input, exp_bits, man_bits, rounding="nearest")
+
+ quantized_fake_fp6 = quantized_fake_fp6.reshape(input_shape).contiguous().to(torch.float16).to(orig_device)
+ scales = scales.to(torch.float16).to(orig_device)
+ # Now the dequantized value is quantized_fake_fp6 * scales
+
+ return quantized_fake_fp6, scales
+
+
+@DSLinearRegistry.register_module
+class QuantizedWf6Af16Linear(DSLinearBase):
+ """
+ Linear DSModule for FP6 weight-only quantization kernel, where weight is FP6
+ and activation is FP16.
+ """
+
+ @staticmethod
+ def name():
+ return 'quantized_wf6af16_linear'
+
+ @staticmethod
+ def supports_config(config: DSLinearConfig) -> bool:
+ if config.input_dtype != config.output_dtype:
+ return False
+
+        # The fp6 data items are packed and stored in a set of fp16
+        # tensors. E.g., 8 fp6 data items are stored in 3 fp16 values.
+ if config.input_dtype != torch.float16:
+ return False
+
+ if is_gated(config.activation):
+ try:
+ _ = CUDAGatedActivation(config.out_channels, config.output_dtype, config.activation)
+ except ValueError:
+ return False
+ else:
+ try:
+ _ = CUDABiasActivation(config.out_channels, config.output_dtype, config.activation)
+ except ValueError:
+ return False
+
+ return True
+
+ def __init__(self, config: DSLinearConfig, implementation_config: Dict[str, Any]) -> None:
+ super().__init__(config, implementation_config)
+
+ self._linear_impl = CUDAWf6Af16Linear()
+
+ if is_gated(config.activation):
+ # In the FP6 kernel implementation, the MatMul is W * A, where W is
+ # the weight and A is activation. M is the output channel size.
+ self.out_channels = self._config.out_channels * 2
+ self.in_channels = self._config.in_channels
+ self._is_gated = True
+ self._act_fn = CUDAGatedActivation(config.out_channels, config.output_dtype, config.activation)
+ self._double_buffer = torch.empty((config.max_tokens, config.out_channels * 2),
+ dtype=config.output_dtype,
+ device=get_accelerator().current_device())
+ else:
+ self.out_channels = self._config.out_channels
+ self.in_channels = self._config.in_channels
+ self._is_gated = False
+ self._act_fn = CUDABiasActivation(config.out_channels, config.output_dtype, config.activation)
+
+ self._output = torch.empty((config.max_tokens, config.out_channels),
+ dtype=config.output_dtype,
+ device=get_accelerator().current_device())
+
+ self.inf_module = InferenceCoreBuilder().load()
+ self.inf_module.create_handle()
+ self.preprocess_weight = self.inf_module.preprocess_weight
+
+ self.quantizer = fp_quantize
+
+ def transform_param(self, param: torch.Tensor) -> InferenceParameter:
+ """
+        Quantizes a weight tensor to FP6 and preprocesses it for the kernel;
+        bias tensors are returned unchanged.
+
+ Parameters:
+ param (torch.Tensor): Weight or bias tensor.
+ """
+        # The quantization scales are stored in the attribute `scales` of the returned parameter.
+
+ if param.ndim == 1: # bias, do nothing
+ return InferenceParameter.initialize(param)
+
+ quantized_fake_fp6, scales = self.quantizer(param, num_bits=6, exp_bits=3)
+
+        # This is for debugging; these checks will be deleted before release.
+ assert (quantized_fake_fp6.dtype == torch.float16)
+ assert quantized_fake_fp6.shape[0] == self.out_channels
+ assert scales.numel() == self.out_channels
+
+ weights_2bit, weights_4bit = self.preprocess_weight(quantized_fake_fp6)
+
+ return InferenceParameter.initialize(weights_2bit, weights_4bit=weights_4bit, scales=scales)
+
+ def forward(self, hidden_states: torch.Tensor, w: torch.Tensor, b: Optional[torch.Tensor] = None) -> torch.Tensor:
+ weights_2bit = w
+ weights_4bit = w.weights_4bit
+ scales = w.scales
+ output = empty_from(self._output, (hidden_states.shape[0], self._config.out_channels))
+ if self._is_gated:
+ staging_output = empty_from(self._double_buffer, (hidden_states.shape[0], self.out_channels))
+ self._linear_impl(staging_output, hidden_states, weights_2bit, weights_4bit, scales, self.out_channels,
+ hidden_states.shape[0], self.in_channels)
+ self._act_fn(output, staging_output, b)
+ else:
+ self._linear_impl(output, hidden_states, weights_2bit, weights_4bit, scales, self.out_channels,
+ hidden_states.shape[0], self.in_channels)
+ self._act_fn(output, b)
+
+ return output
+
+ @property
+ def output(self) -> torch.Tensor:
+ """
+ Return the padded, pre-allocated output Tensor.
+ """
+ return self._output
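Since `fp_quantize` returns a fake-FP6 tensor together with per-channel scales, a quick sanity check on the quantization error is to dequantize and compare, which is exactly what the unit tests added later in this patch do. The sketch below assumes a CUDA-capable accelerator and the optional `qtorch` dependency (added to `requirements-inf.txt` in this PR).

```python
# Hedged sketch: round-tripping a weight through fp_quantize and measuring the error.
import torch

from deepspeed.inference.v2.modules.implementations.linear.quantized_linear import fp_quantize

weight = torch.randn(256, 64, dtype=torch.float16) * 0.01
quantized_fake_fp6, scales = fp_quantize(weight, num_bits=6, exp_bits=3)

# The dequantized value is quantized_fake_fp6 * scales (see the comment inside fp_quantize).
dequantized = quantized_fake_fp6 * scales
print("max abs error:", (dequantized - weight).abs().max().item())
```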
diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py
index ba238cba7508..cf087c16da8a 100644
--- a/deepspeed/module_inject/fusedqkv_utils.py
+++ b/deepspeed/module_inject/fusedqkv_utils.py
@@ -113,7 +113,7 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None, module=None):
raise ValueError("unknown fused_qkv_type")
- module_name_matches = [k for k in fused_type_dict.keys() if module_str in k]
+ module_name_matches = [k for k in fused_type_dict.keys() if k in module_str]
if module_name_matches:
# There can be overlap with matches (e.g., "DecoderLayer" and "FalconDecoderLayer").
# We take the longest matching module_name
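The one-line change above reverses the substring test: the dictionary key must now be contained in the module's string representation, not the other way around, and the longest (most specific) match wins. The snippet below illustrates this with hypothetical keys and values; the actual `fused_type_dict` contents live in `fusedqkv_utils.py`.

```python
# Hypothetical illustration of the reversed substring match plus longest-match selection.
fused_type_dict = {"DecoderLayer": "glmtype", "FalconDecoderLayer": "bloomtype"}  # made-up entries
module_str = "FalconDecoderLayer(...)"  # str(module) of the layer being transposed

module_name_matches = [k for k in fused_type_dict.keys() if k in module_str]
# Both "DecoderLayer" and "FalconDecoderLayer" match; take the longest, most specific one.
best = max(module_name_matches, key=len)
print(best)  # FalconDecoderLayer
```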
diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py
index 46f7924ac038..dfa9fcf4f464 100644
--- a/deepspeed/moe/layer.py
+++ b/deepspeed/moe/layer.py
@@ -32,6 +32,7 @@ class MoE(nn.Module):
use_rts (bool, optional): default=True, whether to use Random Token Selection.
use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts
+ top2_2nd_expert_sampling (bool, optional): default=True, whether to perform sampling for 2nd expert
"""
def __init__(self,
@@ -48,7 +49,8 @@ def __init__(self,
drop_tokens: bool = True,
use_rts: bool = True,
use_tutel: bool = False,
- enable_expert_tensor_parallelism: bool = False) -> None:
+ enable_expert_tensor_parallelism: bool = False,
+ top2_2nd_expert_sampling: bool = True) -> None:
super(MoE, self).__init__()
@@ -69,7 +71,8 @@ def __init__(self,
experts = Experts(expert, self.num_local_experts, self.expert_group_name)
self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor,
- min_capacity, noisy_gate_policy, drop_tokens, use_rts),
+ min_capacity, noisy_gate_policy, drop_tokens, use_rts,
+ top2_2nd_expert_sampling),
experts,
self.expert_group_name,
self.ep_size,
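The new `top2_2nd_expert_sampling` flag above controls whether the second expert is chosen by sampling (via the Gumbel-max trick used in `sharded_moe.py` below) or deterministically as the runner-up of the raw logits. A minimal, self-contained illustration of that trick, using a hand-rolled Gumbel sample rather than DeepSpeed's `gumbel_rsample`:

```python
# Minimal sketch of the Gumbel-max trick behind `top2_2nd_expert_sampling` (illustrative only).
import torch
import torch.nn.functional as F

logits = torch.randn(4, 8)                   # [tokens, experts]
mask1 = F.one_hot(logits.argmax(dim=1), 8)   # 1st expert per token

# Adding Gumbel noise and taking argmax samples an expert with probability softmax(logits);
# with sampling disabled, the 2nd expert is simply the runner-up of the raw logits.
gumbel = -torch.log(-torch.log(torch.rand_like(logits)))
noisy = logits + gumbel

second = noisy.masked_fill(mask1.bool(), float("-inf")).argmax(dim=1)
print(second)
```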
diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py
index d92211b9d220..d6c023ec11d3 100644
--- a/deepspeed/moe/sharded_moe.py
+++ b/deepspeed/moe/sharded_moe.py
@@ -95,11 +95,7 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
class _AllToAll(torch.autograd.Function):
@staticmethod
- def forward(
- ctx: Any,
- # TODO: replace with DS process group
- group: torch.distributed.ProcessGroup,
- input: Tensor) -> Tensor: # type: ignore
+ def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor: # type: ignore
ctx.group = group
input = input.contiguous()
output = torch.empty_like(input)
@@ -214,6 +210,11 @@ def top1gating(logits: Tensor,
if not drop_tokens:
new_capacity = torch.max(exp_counts).to(logits.device)
dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group())
+ if groups._get_expert_model_parallel_world_size() == 1:
+            # If the non-expert is tensor-parallel, we need to pad the capacity to a multiple of 'tp'.
+            # This is because we are going to use drop_tokens() to drop the duplicate tokens.
+ tp = 1 if groups.mpu is None else groups.mpu.get_tensor_model_parallel_world_size()
+ new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype)
capacity = new_capacity
# Compute l_aux
@@ -279,23 +280,27 @@ def top1gating(logits: Tensor,
return l_aux, combine_weights, dispatch_mask, exp_counts
-def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+def top2gating(logits: Tensor,
+ capacity_factor: float,
+ min_capacity: int,
+ drop_tokens: bool = True,
+ top2_2nd_expert_sampling: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
"""Implements Top2Gating on logits."""
# everything is in fp32 in this function
gates = F.softmax(logits, dim=1)
- capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity))
-
# Create a mask for 1st's expert per token
indices1_s = torch.argmax(gates, dim=1)
num_experts = int(gates.shape[1])
mask1 = F.one_hot(indices1_s, num_classes=num_experts)
- # Create a mask for 2nd's expert per token using Gumbel-max trick
- # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/
- logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device)
+ if top2_2nd_expert_sampling:
+ # Create a mask for 2nd's expert per token using Gumbel-max trick
+ # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/
+ logits += gumbel_rsample(logits.shape, device=logits.device)
+
# Replace top-expert with min value
- logits_except1 = logits_w_noise.masked_fill(mask1.bool(), float("-inf"))
+ logits_except1 = logits.masked_fill(mask1.bool(), float("-inf"))
indices2_s = torch.argmax(logits_except1, dim=1)
mask2 = F.one_hot(indices2_s, num_classes=num_experts)
@@ -305,17 +310,29 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup
# Update 2nd's location by accounting for locations of 1st
locations2 += torch.sum(mask1, dim=0, keepdim=True)
- # gating decisions
- exp_counts = torch.sum(mask1, dim=0).detach().to('cpu')
-
# Compute l_aux
me = torch.mean(gates, dim=0)
ce = torch.mean(mask1.float(), dim=0)
l_aux = torch.mean(me * ce) * num_experts * num_experts
- # Remove locations outside capacity from mask
- mask1 *= torch.lt(locations1, capacity)
- mask2 *= torch.lt(locations2, capacity)
+ # gating decisions
+ exp_counts = torch.sum(mask1 + mask2, dim=0)
+
+ if drop_tokens:
+ # Calculate configured capacity and remove locations outside capacity from mask
+ capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity))
+ mask1 *= torch.lt(locations1, capacity)
+ mask2 *= torch.lt(locations2, capacity)
+ else:
+ # Do not drop tokens - set capacity according to current expert assignments
+ new_capacity = torch.max(exp_counts)
+ dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group())
+ if groups._get_expert_model_parallel_world_size() == 1:
+            # If the non-expert is tensor-parallel, we need to pad the capacity to a multiple of 'tp'.
+            # This is because we are going to use drop_tokens() to drop the duplicate tokens.
+ tp = 1 if groups.mpu is None else groups.mpu.get_tensor_model_parallel_world_size()
+ new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype)
+ capacity = new_capacity
# Store the capacity location for each token
locations1_s = torch.sum(locations1 * mask1, dim=1)
@@ -342,7 +359,7 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup
combine_weights = combine1_sec + combine2_sec
dispatch_mask = combine_weights.bool()
- return l_aux, combine_weights, dispatch_mask, exp_counts
+ return l_aux, combine_weights, dispatch_mask, exp_counts.detach().to('cpu')
class TopKGate(Module):
@@ -372,13 +389,14 @@ def __init__(self,
min_capacity: int = 8,
noisy_gate_policy: Optional[str] = None,
drop_tokens: bool = True,
- use_rts: bool = True) -> None:
+ use_rts: bool = True,
+ top2_2nd_expert_sampling: bool = True) -> None:
super().__init__()
# Only top-1 and top-2 are supported at the moment.
if k != 1 and k != 2:
raise ValueError('Only top-1 and top-2 gatings are supported.')
- self.wg = torch.nn.Linear(model_dim, num_experts, bias=False).float()
+ self.wg = torch.nn.Linear(model_dim, num_experts, bias=False)
self.k = k
self.capacity_factor = capacity_factor
self.eval_capacity_factor = eval_capacity_factor
@@ -389,6 +407,7 @@ def __init__(self,
self.gate_time = 0.0
self.drop_tokens = drop_tokens
self.use_rts = use_rts
+ self.top2_2nd_expert_sampling = top2_2nd_expert_sampling
def forward(self,
input: torch.Tensor,
@@ -398,13 +417,11 @@ def forward(self,
if self.wall_clock_breakdown:
self.timers(TOPK_GATE_TIMER).start()
- if self.wg.weight.dtype != torch.float32:
- self.wg = self.wg.float()
input_fp32 = input.float()
# input jittering
if self.noisy_gate_policy == 'Jitter' and self.training:
input_fp32 = multiplicative_jitter(input_fp32, device=input.device)
- logits = self.wg(input_fp32)
+ logits = torch.nn.functional.linear(input_fp32, weight=self.wg.weight.float(), bias=None)
if self.k == 1:
gate_output = top1gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor,
@@ -413,7 +430,7 @@ def forward(self,
else:
gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor,
- self.min_capacity)
+ self.min_capacity, self.drop_tokens, self.top2_2nd_expert_sampling)
if self.wall_clock_breakdown:
self.timers(TOPK_GATE_TIMER).stop()
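Both gating functions now pad the no-drop capacity up to a multiple of the tensor-parallel degree so the duplicated tokens on each TP rank can be dropped cleanly. The arithmetic is just a ceiling division; a standalone example with made-up numbers:

```python
# Standalone example of the capacity padding used in top1gating/top2gating (made-up numbers).
import torch

new_capacity = torch.tensor(7)   # max tokens routed to any single expert
tp = 4                           # tensor-parallel world size of the non-expert part

padded = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype)
print(padded.item())  # 8 -> every TP rank keeps the same, evenly divisible capacity
```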
diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py
index d63d7e985e07..543795126fab 100644
--- a/deepspeed/runtime/comm/coalesced_collectives.py
+++ b/deepspeed/runtime/comm/coalesced_collectives.py
@@ -12,8 +12,7 @@
import torch
from torch import Tensor
from deepspeed import comm as dist
-# NOTE: Use torch.distributed's ProcessGroup class until we have our own.
-from torch.distributed import ProcessGroup, all_to_all_single
+from deepspeed.comm import ProcessGroup, all_to_all_single
from deepspeed.accelerator import get_accelerator
from deepspeed.utils import instrument_w_nvtx
from deepspeed.ops import op_builder
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 5cf655d8741a..142259c1b7df 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -1635,19 +1635,16 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False):
secondary_end = secondary_start + secondary_partition_size
one_dim_param = param.contiguous().view(-1)
- start = partition_size * self.rank
- end = start + partition_size
- if start < param.ds_numel and end <= param.ds_numel:
- if secondary_start < param.ds_numel and secondary_end <= param.ds_numel:
- sec_src_tensor = one_dim_param.narrow(0, secondary_start, secondary_partition_size)
- param.ds_secondary_tensor.copy_(sec_src_tensor)
- else:
- if start < param.ds_numel:
- elements_to_copy = param.ds_numel - start
- elements_to_copy_sec = elements_to_copy * param.ds_secondary_tensor_num_of_groups
- param.ds_secondary_tensor.narrow(0, 0, elements_to_copy_sec).copy_(
- one_dim_param.narrow(0, secondary_start, elements_to_copy_sec))
+ # ds_numel is unpadded, so the last chunk of the secondary tensor might not be secondary_partition_size
+ sec_numel = param.ds_numel - secondary_start if secondary_end > param.ds_numel else secondary_partition_size
+
+ # copy from full tensor to secondary tensor
+ param.ds_secondary_tensor.narrow(0, 0,
+ sec_numel).copy_(one_dim_param.narrow(0, secondary_start, sec_numel))
+
+ # TODO: This is a temporary fix to avoid the issue that 2nd tensor all-gather happens before 2nd tensor partition is done
+ get_accelerator().current_stream().synchronize()
print_rank_0(f"{param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}",
force=False)
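The replacement logic above boils down to clamping the copy length of the last secondary chunk to the unpadded parameter size. A tiny numeric example (hypothetical sizes) makes the intent clear:

```python
# Tiny numeric example of the last-chunk clamp in _partition_param_sec (hypothetical sizes).
ds_numel = 10                    # unpadded number of elements in the parameter
secondary_partition_size = 4     # elements each secondary rank normally holds
secondary_start = 8              # this rank's offset into the flattened parameter
secondary_end = secondary_start + secondary_partition_size  # 12 > ds_numel

# Same expression as in the patch: only copy what actually exists past secondary_start.
sec_numel = ds_numel - secondary_start if secondary_end > ds_numel else secondary_partition_size
print(sec_numel)  # 2
```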
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index e4009f6ac883..71a01b2391f8 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1946,8 +1946,10 @@ def _average_expert_grad_norms(self, norm_groups):
for i, norm in enumerate(norm_groups):
if self.is_moe_param_group[i]:
scaled_norm_tensor = norm * 1.0 / dist.get_world_size(group=self.real_dp_process_group[i])
+ if self.device == 'cpu':
+ scaled_norm_tensor = scaled_norm_tensor.to(get_accelerator().current_device_name())
dist.all_reduce(scaled_norm_tensor, group=self.real_dp_process_group[i])
- norm_groups[i] = scaled_norm_tensor
+ norm_groups[i] = scaled_norm_tensor.to(self.device)
def unscale_and_clip_grads(self, grad_groups_flat, total_norm):
# compute combined scale factor for this group
diff --git a/docs/index.md b/docs/index.md
index e1b6609d3bfe..e3351ee1a3d7 100755
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,6 +7,7 @@ title: "Latest News"
---
DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
+* [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py
index 8073b63ad16b..3c53774d0a50 100755
--- a/op_builder/inference_core_ops.py
+++ b/op_builder/inference_core_ops.py
@@ -57,6 +57,8 @@ def get_prefix(self):
return "deepspeed" if os.path.isdir(ds_path) else ".."
def sources(self):
+ import torch
+
sources = [
"inference/v2/kernels/core_ops/core_ops.cpp",
"inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp",
@@ -69,6 +71,15 @@ def sources(self):
"inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu",
]
+ # The source files with specific GPU architecture requirements.
+ if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
+ cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
+ if cuda_capability != 8:
+ self.warning("FP6 quantization kernel is only supported on Ampere architectures")
+ else:
+ sources.append("inference/v2/kernels/core_ops/cuda_linear/fp6_linear.cu")
+ sources.append("inference/v2/kernels/core_ops/cuda_linear/cuda_linear_kernels.cpp")
+
prefix = self.get_prefix()
sources = [os.path.join(prefix, src) for src in sources]
return sources
@@ -83,6 +94,7 @@ def include_paths(self):
'inference/v2/kernels/core_ops/cuda_layer_norm',
'inference/v2/kernels/core_ops/cuda_rms_norm',
'inference/v2/kernels/core_ops/gated_activations',
+ 'inference/v2/kernels/core_ops/cuda_linear',
'inference/v2/kernels/includes',
]
diff --git a/requirements/requirements-inf.txt b/requirements/requirements-inf.txt
index 7a40ae814cbe..b7fd13787e8b 100644
--- a/requirements/requirements-inf.txt
+++ b/requirements/requirements-inf.txt
@@ -1,6 +1,7 @@
google
lm-eval==0.3.0
protobuf
+qtorch
safetensors
sentencepiece
transformers>=4.32.1
diff --git a/tests/unit/inference/v2/modules/test_quantized_linear_module.py b/tests/unit/inference/v2/modules/test_quantized_linear_module.py
new file mode 100644
index 000000000000..a7bd965072ac
--- /dev/null
+++ b/tests/unit/inference/v2/modules/test_quantized_linear_module.py
@@ -0,0 +1,184 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Optional
+
+import pytest
+import torch
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.inference.v2.inference_utils import ActivationType, DtypeEnum, is_gated
+from deepspeed.inference.v2.modules import ConfigBundle
+from deepspeed.inference.v2.modules.configs import DSLinearConfig
+from deepspeed.inference.v2.modules.interfaces import DSLinearRegistry
+from ...v2.inference_test_utils import allclose
+
+
+def reference_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor],
+ act_type: ActivationType) -> torch.Tensor:
+ dtype = hidden_states.dtype
+ out_states = torch.nn.functional.linear(hidden_states, weight, bias)
+ out_states.float()
+
+ if is_gated(act_type):
+ act_func_map = {
+ ActivationType.ReGLU: torch.nn.functional.relu,
+ ActivationType.GEGLU: lambda x: torch.nn.functional.gelu(x, approximate="tanh"),
+ ActivationType.SiGLU: torch.nn.functional.silu,
+ }
+
+ act_act = out_states[..., ::2]
+ act_linear = out_states[..., 1::2]
+
+ act_act = act_func_map[act_type](act_act)
+ out_states = act_act * act_linear
+ else:
+ act_func_map = {
+ ActivationType.RELU: torch.nn.functional.relu,
+ ActivationType.GELU: torch.nn.functional.gelu,
+ ActivationType.SILU: torch.nn.functional.silu,
+ ActivationType.IDENTITY: lambda x: x,
+ }
+
+ out_states = act_func_map[act_type](out_states)
+ return out_states.to(dtype)
+
+
+def _fp6_quant_dequant_weights(weight: torch.Tensor) -> torch.Tensor:
+ from deepspeed.inference.v2.modules.implementations.linear.quantized_linear import fp_quantize
+ weight_quantized_fake_fp6, scales = fp_quantize(weight, num_bits=6, exp_bits=3)
+ return weight_quantized_fake_fp6 * scales
+
+
+def quant_dequant_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor],
+ act_type: ActivationType) -> torch.Tensor:
+ dtype = hidden_states.dtype
+ weight_dequantized = _fp6_quant_dequant_weights(weight)
+ out_states = torch.nn.functional.linear(hidden_states, weight_dequantized, bias)
+ out_states.float()
+
+ if is_gated(act_type):
+ act_func_map = {
+ ActivationType.ReGLU: torch.nn.functional.relu,
+ ActivationType.GEGLU: lambda x: torch.nn.functional.gelu(x, approximate="tanh"),
+ ActivationType.SiGLU: torch.nn.functional.silu,
+ }
+
+ act_act = out_states[..., ::2]
+ act_linear = out_states[..., 1::2]
+
+ act_act = act_func_map[act_type](act_act)
+ out_states = act_act * act_linear
+ else:
+ act_func_map = {
+ ActivationType.RELU: torch.nn.functional.relu,
+ ActivationType.GELU: torch.nn.functional.gelu,
+ ActivationType.SILU: torch.nn.functional.silu,
+ ActivationType.IDENTITY: lambda x: x,
+ }
+
+ out_states = act_func_map[act_type](out_states)
+ return out_states.to(dtype)
+
+
+def _fp6_quantized_linear_helper(tokens: int,
+ in_channels: int,
+ out_channels: int,
+ dtype: DtypeEnum,
+ act_fn: ActivationType,
+ use_bias: bool = True,
+ expect_failure: bool = False) -> None:
+ # The current FP6 kernel only supports NVIDIA Ampere GPUs.
+    if 'cuda' not in get_accelerator().current_device_name():
+ return
+ major, _ = torch.cuda.get_device_capability() #ignore-cuda
+ if major != 8:
+ return
+
+ # Input vals
+ hidden_states = torch.randn(
+ (tokens, in_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01
+
+    weight_out_channels = 2 * out_channels if is_gated(act_fn) else out_channels
+ weight = torch.randn(
+ (weight_out_channels, in_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01
+ if use_bias:
+ bias = torch.randn(
+ (weight_out_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01
+ else:
+ bias = None
+
+ # quantize and dequantize output
+ ref_quant_dequant_output = quant_dequant_implementation(hidden_states, weight, bias, act_fn)
+
+ linear_config = DSLinearConfig(max_tokens=2048,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ activation=act_fn,
+ input_dtype=dtype,
+ output_dtype=dtype)
+ bundle = ConfigBundle(name='quantized_wf6af16_linear', config=linear_config)
+ fp6_linear_module = DSLinearRegistry.instantiate_config(bundle)
+ weight_fp6 = fp6_linear_module.transform_param(weight.clone().cpu()).to(get_accelerator().current_device_name())
+
+ if expect_failure:
+ with pytest.raises(ValueError) as excinfo:
+ ds_output = fp6_linear_module(hidden_states, weight_fp6, bias)
+ assert "The out and in channel should be multiple of 256 and 64 respectively." in str(excinfo.value)
+ else:
+ ds_output = fp6_linear_module(hidden_states, weight_fp6, bias)
+ # The current FP6 kernel uses FP16 Tensor Core.
+ tolerances = (3e-2, 2e-3) # tolerances for fp16
+
+ # Check DeepSpeed implementation
+ assert allclose(ds_output, ref_quant_dequant_output, tolerances=tolerances)
+
+
+all_acts = [
+ ActivationType.RELU,
+ ActivationType.GELU,
+ ActivationType.SILU,
+ ActivationType.GEGLU,
+ ActivationType.ReGLU,
+ ActivationType.SiGLU,
+]
+all_tokens = [1, 37]
+all_in_out_channels = [
+ (4096, 4096),
+ (8192, 28672),
+]
+
+
+@pytest.mark.inference_v2_ops
+@pytest.mark.parametrize("tokens", all_tokens)
+@pytest.mark.parametrize("in_channels, out_channels", all_in_out_channels)
+@pytest.mark.parametrize("act_fn", all_acts)
+@pytest.mark.parametrize("use_bias", [True, False])
+def test_fp6_quantized_linear_act_fn(tokens: int, in_channels: int, out_channels: int, act_fn: ActivationType,
+ use_bias: bool) -> None:
+ _fp6_quantized_linear_helper(tokens=tokens,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ dtype=DtypeEnum.fp16,
+ act_fn=act_fn,
+ use_bias=use_bias)
+
+
+# Other shapes, not supported by FP6 kernels. Will raise ValueError.
+@pytest.mark.inference_v2_ops
+@pytest.mark.parametrize("tokens", all_tokens)
+@pytest.mark.parametrize("in_channels, out_channels", [(4608, 1728)])
+@pytest.mark.parametrize("act_fn", all_acts)
+@pytest.mark.parametrize("use_bias", [True, False])
+def test_fp6_quantized_linear_act_fn_fail(tokens: int, in_channels: int, out_channels: int, act_fn: ActivationType,
+ use_bias: bool) -> None:
+ _fp6_quantized_linear_helper(tokens=tokens,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ dtype=DtypeEnum.fp16,
+ act_fn=act_fn,
+ use_bias=use_bias,
+ expect_failure=True)
diff --git a/tests/unit/inference/v2/ragged/test_manager_configs.py b/tests/unit/inference/v2/ragged/test_manager_configs.py
index bdd513445ddb..a5f270cced8c 100644
--- a/tests/unit/inference/v2/ragged/test_manager_configs.py
+++ b/tests/unit/inference/v2/ragged/test_manager_configs.py
@@ -5,7 +5,7 @@
import pytest
-from pydantic import ValidationError
+from deepspeed.pydantic_v1 import ValidationError
from deepspeed.inference.v2.ragged import DSStateManagerConfig
diff --git a/version.txt b/version.txt
index c37136a84824..930e3000bdc9 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.13.5
+0.14.1