diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index e03cbb353bd3..ccb6c25e14f7 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -33,7 +33,7 @@ jobs: #python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Compile DeepSpeed Ops run: | - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 pip3 install . + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . - name: DS Report run: | ds_report diff --git a/.github/workflows/nv-torch110-p40.yml b/.github/workflows/nv-torch110-p40.yml index 95fccb2de9d3..45f3e0438233 100644 --- a/.github/workflows/nv-torch110-p40.yml +++ b/.github/workflows/nv-torch110-p40.yml @@ -3,6 +3,7 @@ name: nv-torch110-p40 on: schedule: - cron: "0 0 * * *" + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-torch110-v100.yml index a3e39a9e5b22..1fd8aaac0ffa 100644 --- a/.github/workflows/nv-torch110-v100.yml +++ b/.github/workflows/nv-torch110-v100.yml @@ -3,6 +3,7 @@ name: nv-torch110-v100 on: schedule: - cron: "0 0 * * *" + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 36fa34a42744..6b11b3acba51 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,7 +58,7 @@ repos: # Do not check files that are automatically generated '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json', '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word - '--ignore-words-list=youn,unsupport', # Word used in error messages that need rewording + '--ignore-words-list=youn,unsupport,noe', # Word used in error messages that need rewording --check-filenames, --check-hidden ] diff --git a/README.md b/README.md index 4999a485f4ce..6aef71b8e66e 100755 --- a/README.md +++ b/README.md @@ -15,11 +15,11 @@ ## Latest News DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). 
+* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] * [2023/08] [DeepSpeed ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) * [2023/08] [DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md) -* [2023/08] [DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) +* [2023/08] [DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md)] * [2023/06] [ZeRO++: A leap in speed for LLM and chat model training with 4X less communication](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/)[[English](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md)] -* [2023/04] 🚀 [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]🚀 --- @@ -35,9 +35,9 @@ --- -# DeepSpeed's three innovation pillars +# DeepSpeed's four innovation pillars - + ## DeepSpeed-Training @@ -53,6 +53,10 @@ DeepSpeed brings together innovations in parallelism technology such as tensor, To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the compression pillar. 
Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression) +## DeepSpeed4Science + +In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](https://www.deepspeed.ai/deepspeed4science/) + --- # DeepSpeed Software Suite diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index fe0e66768d45..a87ff3c1d223 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -185,6 +185,10 @@ def lazy_call(self, callback): def communication_backend_name(self): ... + @abc.abstractmethod + def is_triton_supported(self): + ... + # Tensor operations @property @abc.abstractmethod diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 11518d31e069..4de4ad93c2bb 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -182,6 +182,9 @@ def lazy_call(self, callback): def communication_backend_name(self): return self._communication_backend_name + def is_triton_supported(self): + return False + # Data types def is_bf16_supported(self): return True diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 9c1e0d22785e..045cce510a90 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -173,6 +173,13 @@ def lazy_call(self, callback): def communication_backend_name(self): return self._communication_backend_name + def is_triton_supported(self): + major, _ = torch.cuda.get_device_capability() + if major >= 8: + return True + else: + return False + # Tensor operations @property diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 68e3b6df05c9..63a92f250898 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -138,6 +138,9 @@ def is_bf16_supported(self): def is_fp16_supported(self): return False + def supported_dtypes(self): + return [torch.float] + # Misc def amp(self): return @@ -157,6 +160,9 @@ def lazy_call(self, callback): def communication_backend_name(self): return self._communication_backend_name + def is_triton_supported(self): + return False + # Tensor operations @property def BFloat16Tensor(self): diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 5678a0266386..206bc1dfaa1b 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -158,6 +158,9 @@ def lazy_call(self, callback): def communication_backend_name(self): return self._communication_backend_name + def is_triton_supported(self): + return False + # Tensor operations @property diff --git a/blogs/deepspeed4science/README.md b/blogs/deepspeed4science/README.md new file mode 100644 index 000000000000..2a80ea2e749e --- /dev/null +++ b/blogs/deepspeed4science/README.md @@ -0,0 +1,7 @@ +
+ +# Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies + +
+ +[https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/) diff --git a/blogs/deepspeed4science/chinese/README.md b/blogs/deepspeed4science/chinese/README.md new file mode 100644 index 000000000000..3ffddfb16fe5 --- /dev/null +++ b/blogs/deepspeed4science/chinese/README.md @@ -0,0 +1,145 @@ +
+ +# DeepSpeed4Science:利用先进的AI系统优化技术实现科学发现 + +
+ +*此博客为英文博客[Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)的官方翻译* + +
+ + +*图1:DeepSpeed4Science方法概述:专为加速科学发现和应对其复杂性而量身定制的AI系统技术开发。* +
+ +## 简介 + +在接下来的十年中,深度学习可能会彻底改变自然科学,增强我们对自然现象进行建模和预测的能力。这可能预示着科学探索的新时代,为从药物开发到可再生能源的各个领域带来重大进展。为了响应这一机会以及微软“予力全球每一人、每一组织,成就不凡”的使命,[微软DeepSpeed团队](https://www.deepspeed.ai/)启动了一个名为[DeepSpeed4Science](https://deepspeed4science.ai/)的新计划,旨在通过AI系统技术创新帮助领域专家解锁当今最大的科学之谜。 + +[DeepSpeed](https://www.deepspeed.ai/)系统是由微软开发的业界领先的开源AI系统框架,它为各种AI硬件上的深度学习训练和推理提供了前所未有的规模和速度。图1展示了我们对DeepSpeed4Science这一新计划的基本方法。通过利用DeepSpeed当前的技术方案(训练、推理和压缩)作为基础技术推动器,DeepSpeed4Science将创建一套专为加速科学发现而量身定制的AI系统技术,以应对其独特的复杂性,超越用于加速通用大型语言模型(LLMs)的常见技术方法。我们与拥有科学AI模型的内部和外部团队紧密合作,以发现和解决领域特定AI系统的挑战。这包括气候科学、药物设计、生物学理解、分子动力学模拟、癌症诊断和监测、催化剂/材料发现、和其他领域。 + +我们的长期愿景是将DeepSpeed4Science发展成一个用于分享支持科学发现的先进AI技术的软件平台和统一代码仓库。DeepSpeed4Science的设计旨在包容性,呼应微软的[“AI for Good”承诺](https://www.microsoft.com/en-us/ai/ai-for-good)。这体现在该计划对一系列标志性科学模型的支持上,他们代表了一些最关键的AI4Science应用场景。在这篇博客中,我们展示了DeepSpeed4Science如何帮助解决结构生物学研究中的两个关键AI系统挑战:(1) 解决了以Evoformer为中心的蛋白质结构预测模型中的内存爆炸问题,以及(2)为更好地理解引发大流行的病毒的进化提供AI模型长序列支持。 + +## 我们的初期主要合作者 + +DeepSpeed4Science的新系统技术可以用于很多推动科学边界的标志性模型,赋能AI驱动的科学发现。目前,DeepSpeed4Science很荣幸地支持来自[微软研究院AI4Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai4science/)、[微软WebXT/Bing](https://www.msn.com/en-us/weather/forecast/)、[美国能源部国家实验室](https://www.energy.gov/national-laboratories)和多所大学的几个关键科学模型。 + +### 微软内部合作伙伴 + +#### 科学基础模型(Scientific Foundation Model,SFM),微软研究院AI4Science + +
+ + + +*图2:科学基础模型(Scientific Foundation Model,SFM)及其当前探索:Distributional Graphormer。* +
+ +科学基础模型(SFM)旨在创建一个统一的大规模基础模型,以支持自然科学发现,支持多种输入、多个科学领域(例如,药物、材料、生物学、健康等)和计算任务。DeepSpeed4Science合作伙伴关系将为SFM团队提供新的训练和推理技术,以支持他们的新生成AI方法(例如[Distributional Graphormer](https://www.microsoft.com/en-us/research/blog/distributional-graphormer-toward-equilibrium-distribution-prediction-for-molecular-systems/))这样的项目进行持续研究。 + +#### ClimaX,微软研究院AI4Science + +
+ + +*图3:ClimaX是第一个设计用于执行各种天气和气候建模任务的基础模型。* +
+ +我们的气候正在发生变化,导致极端天气事件的频率增加。为了减轻负面影响,预测这些事件将发生的地方变得越来越重要。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)是第一个设计用于执行各种天气和气候建模任务的基础模型。它可以吸收许多具有不同变量和分辨率的数据集以提高天气预报的准确性。DeepSpeed4Science正在为ClimaX创建新的系统支持和加速策略,以高效地预训练/微调更大的基础模型,同时处理非常大的高分辨率图像数据(例如,数十到数百PB)和长序列。 + +#### AI驱动的第一性原理分子动力学(AI Powered Ab Initio Molecular Dynamics,AI2MD),微软研究院AI4Science + +
+ + +*图4:一百万步的分子动力学模拟:RBD-蛋白(RBD-protein)与蛋白抑制剂(protein inhibitor)相互作用。* +
+ +这个项目模拟了使用[AI驱动的力场模型](https://www.microsoft.com/en-us/research/publication/ai2bmd-efficient-characterization-of-protein-dynamics-with-ab-initio-accuracy/)进行近似第一性原理计算精度的大型(百万原子)分子系统的动态模拟,同时保持了经典分子动力学的效率和可扩展性。这些模拟足够高效,可以生成足够长的轨迹来观察化学上有意义的事件。通常,这个过程需要数百万甚至数十亿的推理步骤。这对优化图神经网络(GNN)+ LLM模型的推理速度提出了重大挑战,DeepSpeed4Science将为此提供新的加速策略。 + +#### 微软天气,微软WebXT/Bing + +
+ + +*图5:微软降水预报(每4分钟一次对接下来4小时进行预测)。* +
+ +[微软天气](https://www.msn.com/en-us/weather/forecast/)提供精确的天气信息,[帮助用户为他们的生活方式、健康、工作和活动做出更好的决策](https://blogs.windows.com/windowsexperience/2022/08/31/microsoft-joins-noaas-weather-ready-nation-ambassador-initiative-to-help-improve-americas-readiness-and-response-to-weather-events/)——包括每小时多次更新的准确的10天全球天气预报。此前,微软天气受益于DeepSpeed技术,加速了他们的多GPU训练环境。目前,DeepSpeed4Science正在与微软WebXT天气预报团队合作,进一步增强微软天气预报服务的最新功能和改进。 + +### 外部合作者 + +DeepSpeed4Science的旅程始于两个开创性的基于LLM的结构生物学研究AI模型:来自哥伦比亚大学的[OpenFold](https://openfold.io/),一个开源的高保真蛋白质结构预测模型;以及来自[阿贡国家实验室](https://www.anl.gov/)的[GenSLMs](https://github.com/ramanathanlab/genslm),一个获得[ACM戈登贝尔奖](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)的用于学习SARS-CoV-2(COVID-19)基因组的进化的语言模型。作为此次发布的特色展示,它们代表了当今AI驱动的结构生物学研究面临的两个常见AI系统挑战。我们将在下一节中讨论DeepSpeed4Science如何赋能这些科学研究。 + +此外,DeepSpeed4Science最近扩大了其范围,以支持更多样的科学模型。例如,在我们与阿贡国家实验室合作训练[Aurora Exascale系统](https://www.anl.gov/aurora)上的万亿参数科学模型的工作中,DeepSpeed4Science技术将帮助他们达到这一关键任务所需的性能要求和可扩展性。此外,通过与[橡树岭国家实验室](https://ai-roadmap.ornl.gov/)和[国家癌症研究所(NCI)](https://www.cancer.gov/)合作进行癌症监测,DeepSpeed4Science将帮助从非结构化的临床文本中高保真地提取和分类信息,以供[MOSSAIC项目](https://www.olcf.ornl.gov/tag/mossaic/)使用。[Brookhaven国家实验室](https://www.bnl.gov/world/)还将采用DeepSpeed4Science技术,支持使用LLMs开发大型数字双胞胎模型,以便为清洁能源研究产生更真实的模拟数据。您可以在[deepspeed4science.ai](https://deepspeed4science.ai/)上找到有关我们外部合作者及其科学任务的更多详细信息。 + +## 合作展示 + +### 展示(I):DeepSpeed4Science通过DS4Sci_EvoformerAttention消除以Evoformer为中心的结构生物学模型的内存爆炸问题 + +
+ + + +*图6:在训练过程中OpenFold对PDB链7B3A_A的预测。* +
+ +[OpenFold](https://github.com/aqlaboratory/openfold)是DeepMind的[AlphaFold2](https://alphafold.com/)的开源社区再现,使其可以在新数据集上训练或微调AlphaFold2。研究人员已经使用它从头开始重新训练AlphaFold2,生成新的模型参数集,研究AlphaFold2的早期训练阶段(图6),并开发新的蛋白质折叠系统。 + +
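As a rough sketch of how the DS4Sci_EvoformerAttention kernels discussed in this showcase are intended to be called from Python: the import path, argument order, and tensor shapes below follow the DeepSpeed4Science tutorial and are assumptions for illustration, not the authoritative API of this PR.

```python
import torch
from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention  # assumed import path

# Assumed MSA-row-attention-style layout: [batch, n_seq, n_res, heads, head_dim]
batch, n_seq, n_res, heads, dim = 1, 64, 256, 8, 32
kwargs = dict(dtype=torch.float16, device="cuda")

Q = torch.randn(batch, n_seq, n_res, heads, dim, **kwargs)
K = torch.randn(batch, n_seq, n_res, heads, dim, **kwargs)
V = torch.randn(batch, n_seq, n_res, heads, dim, **kwargs)

# Two additive biases broadcast onto the attention logits (shapes are assumptions):
res_mask = torch.zeros(batch, n_seq, 1, 1, n_res, **kwargs)       # e.g. an MSA mask
pair_bias = torch.randn(batch, 1, heads, n_res, n_res, **kwargs)  # e.g. a pair-representation bias

out = DS4Sci_EvoformerAttention(Q, K, V, [res_mask, pair_bias])   # output has the same shape as Q
```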
+ + +*图7:在OpenFold中,对多序列比对(MSA)Attention内核(包含偏差)变体的训练峰值内存需求。 (左) 使用在AlphaFold2中的EvoformerAttention的原始OpenFold实现。对于这些类型的蛋白质结构预测模型,在训练/推理中的内存爆炸问题是常见的。最先进的FlashAttention无法有效支持这样的Attention变体。 (右) DeepSpeed4Science的一种新解决方案DS4Sci_EvoformerAttention在不影响模型品质的条件下显著地减少了OpenFold的训练峰值内存需求(最多13倍)。* +
+ +尽管OpenFold有使用最先进的系统技术进行性能和内存优化,但从头开始训练AlphaFold2仍然在计算上很昂贵。目前阶段的模型参数很小,只有9300万个参数,但它包含了几个需要非常大的中间内存的特殊Attention变体。在标准AlphaFold2训练的“微调”阶段,只是这些变体中的其中一个在半精度下就生成了超过12GB的张量,使其峰值内存要求远远超过了相同大小的语言模型。即使使用像activation checkpointing和DeepSpeed ZeRO优化这样的技术,这种内存爆炸问题仍然严重限制了可训练模型的序列长度和MSA深度。此外,近似策略可能会显著影响模型的准确性和收敛性,同时仍然导致内存爆炸,如图7左侧(橙色)所示。 + +为了应对结构生物学研究(例如,蛋白质结构预测和平衡分布预测)中的这一常见系统挑战,DeepSpeed4Science通过为这类科学模型中广泛出现的注意力变体(即EvoformerAttention)设计定制的精确注意力内核来解决这一内存效率问题。具体来说,我们设计了一套由复杂的融合/矩阵分块策略和动态内存减少方法而组成的高内存效率DS4Sci_EvoformerAttention内核,作为高质量机器学习模块供更广泛的生物学研究社区使用。通过整合到OpenFold中,这些定制内核在训练期间提供了显著的加速,并显著减少了模型的训练和推理的峰值内存需求。这使得OpenFold可以用更大、更复杂的模型,使用更长的序列在更广泛的硬件上进行实验。关于这项技术的详细信息可以在[这里](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/)找到。 + +### 展示(II):DeepSpeed4Science通过系统和算法方法为基因组基础模型(例如,GenSLMs)提供长序列支持 + +
+ + +*图8:GenSLMs:获2022年ACM 戈登贝尔奖的COVID基因组模型(基于GPT-NeoX的25B/33B模型)。它用于学习描述SARS-CoV-2基因组生物学意义的潜在空间。这个GIF展示了一个重要的蛋白质家族苹果酸脱氢酶(malate dehydrogenase)的根据重要特征(如序列长度和GC含量(核酸鸟嘌呤和胞嘧啶的含量与腺嘌呤和胸腺嘧啶的比率。它测量DNA链抵抗热的能力))着色的潜在空间的投影。* +
+ +[GenSLMs](https://github.com/ramanathanlab/genslm),一个来自阿贡国家实验室的[2022年ACM 戈登贝尔奖获奖](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)的基因组模型,可以通过大型语言模型(LLMs)的基因组数据训练来学习SARS-CoV-2(COVID-19)基因组的进化。它旨在改变如何识别和分类引发大流行的病毒(特别是SARS-CoV-2)的新变种。GenSLMs代表了第一批可以泛化到其他预测任务的基因组基础模型。对潜在空间的良好理解可以帮助GenSLMs处理超出仅仅是病毒序列的新领域,并扩展它们模拟细菌病原体甚至真核生物的能力(例如,理解功能、途径成员资格和进化关系等事物)。为了实现这一科学目标,GenSLMs和类似的模型需要非常长的序列支持用于训练和推理,这超出了像[FlashAttention](https://arxiv.org/abs/2307.08691)这样的通用LLM的长序列策略。通过DeepSpeed4Science的新设计,科学家现在可以构建和训练具有显著更长的上下文窗口的模型,允许他们探索以前无法访问的关系。 + +
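To make the system-level idea concrete, the sketch below shows the all-to-all style sequence parallelism (in the spirit of DeepSpeed-Ulysses) that this kind of long-sequence support builds on. It is an illustrative toy, not the Megatron-DeepSpeed implementation released with this work; it assumes `torch.distributed` is already initialized and that the head count is divisible by the number of ranks.

```python
import torch
import torch.distributed as dist

def seq_parallel_attention(q, k, v, group=None):
    """Toy all-to-all sequence parallelism (Ulysses-style sketch).

    Each rank holds a shard of the sequence: q, k, v are [seq_local, heads, dim].
    The first all-to-all re-partitions the work so that every rank sees the full
    sequence but only heads // world_size heads, runs ordinary attention, and a
    second all-to-all restores the original [seq_local, heads, dim] layout.
    """
    world = dist.get_world_size(group)
    seq_local, heads, dim = q.shape
    assert heads % world == 0

    def gather_seq_scatter_heads(x):
        # [seq_local, heads, dim] -> [world, seq_local, heads/world, dim]; block p goes to rank p
        x = x.reshape(seq_local, world, heads // world, dim).transpose(0, 1).contiguous()
        out = torch.empty_like(x)
        dist.all_to_all_single(out, x, group=group)
        # Received blocks are the sequence chunks of every rank, in rank order.
        return out.reshape(world * seq_local, heads // world, dim)

    qf, kf, vf = (gather_seq_scatter_heads(t) for t in (q, k, v))
    # The score matrix is materialized naively here; real systems pair this with
    # memory-efficient (blockwise/FlashAttention-style) kernels.
    scores = torch.einsum("qhd,khd->hqk", qf, kf) / dim**0.5
    ctx = torch.einsum("hqk,khd->qhd", scores.softmax(dim=-1), vf)

    # Reverse all-to-all: scatter the sequence back, gather the heads back.
    ctx = ctx.reshape(world, seq_local, heads // world, dim).contiguous()
    out = torch.empty_like(ctx)
    dist.all_to_all_single(out, ctx, group=group)
    return out.transpose(0, 1).reshape(seq_local, heads, dim)
```

In practice this communication pattern is combined with the other techniques listed below (memory-efficient attention kernels, ZeRO-style data parallelism, pipeline/tensor parallelism) rather than used on its own.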
+ + +*图9:由不同框架在不同规模下支持的两个GenSLMs模型的最大序列长度。使用NVIDIA DGX,每个节点有八个40G A100 GPU。* +
+ +具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/microsoft/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码异步处理和位置码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态异步处理等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于那些更喜欢相对位置编码技术这样的算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也进行了集成。 + +## 总结和路线图 + +我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请给我们发送电子邮件至。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)上报告问题、贡献PR、参与讨论。 + +## 致谢 + +**Core DeepSpeed4Science Team:** + +Shuaiwen Leon Song (DeepSpeed4Science lead), Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Xiaoxia (Shirley) Wu, Masahiro Tanaka, Martin Cai, Adam Graham, Charlie Zhou, Yuxiong He (DeepSpeed team lead) + +**Our Founding Collaborators (in alphabetical order):** + +**Argonne National Lab team:** Rick Stevens, Cristina Negri, Rao Kotamarthi, Venkatram Vishwanath, Arvind Ramanathan, Sam Foreman, Kyle Hippe, Troy Arcomano, Romit Maulik, Maxim Zvyagin, Alexander Brace, Yuntian Deng, Bin Zhang, Cindy Orozco Bohorquez, Austin Clyde, Bharat Kale, Danilo Perez-Rivera, Heng Ma, Carla M. Mann, Michael Irvin, J. Gregory Pauloski, Logan Ward, Valerie Hayot, Murali Emani, Zhen Xie, Diangen Lin, Maulik Shukla, Weili Nie, Josh Romero, Christian Dallago, Arash Vahdat, Chaowei Xiao, Thomas Gibbs, Ian Foster, James J. Davis, Michael E. Papka, Thomas Brettin, Anima Anandkumar + +**AMD:** Ivo Bolsen, Micheal Schulte, Bo Begole, Angela Dalton, Steve Reinhart, Ashwin Aji, Jalal Mahmud, Mahesh Balashibramanian + +**Brookhaven National Lab team:** Adolfy Hoisie, Shinjae Yoo, Yihui Ren. + +**Columbia University OpenFold team:** Mohammed AlQuraishi, Gustaf Ahdritz + +**Microsoft Research AI4Science team:** Christopher Bishop, Bonnie Kruft, Max Welling, Tie-Yan Liu, Christian Bodnar, Johannes Brandsetter, Wessel Bruinsma, Chan Cao, Yuan-Jyue Chen, Peggy Dai, Patrick Garvan, Liang He, Elizabeth Heider, PiPi Hu, Peiran Jin, Fusong Ju, Yatao Li, Chang Liu, Renqian Luo, Qi Meng, Frank Noe, Tao Qin, Janwei Zhu, Bin Shao, Yu Shi, Wenlei Shi, Gregor Simm, Megan Stanley, Lixin Sun, Yue Wang, Tong Wang, Zun Wang, Lijun Wu, Yingce Xia, Leo Xia, Shufang Xie, Shuxin Zheng, Jianwei Zhu + +**Oakridge National Lab team:** Prassana Balaprakash, Georgia Tourass + +**Princeton University:** William Tang, Kyle Felker, Alexey Svyatkovskiy (Microsoft liaison) + +**Rutgers University:** Hang Liu + +**WebXT Weather team:** Pete Luferenko, Divya Kumar, Jonathan Weyn, Ruixiong Zhang, Sylwester Klocek, Volodymyr Vragov diff --git a/blogs/deepspeed4science/japanese/README.md b/blogs/deepspeed4science/japanese/README.md new file mode 100644 index 000000000000..80fc137e16bb --- /dev/null +++ b/blogs/deepspeed4science/japanese/README.md @@ -0,0 +1,145 @@ +
+ +# DeepSpeed4Scienceイニシアティブ: 洗練されたAIシステムのテクノロジーにより大規模な科学的発見を可能に + +
+ +*こちらは英語ブログ[Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)の公式の翻訳です* + +
+ + +*図1:DeepSpeed4Scienceのアプローチ: 汎用の言語モデルのサポートを超え、科学的発見とその複雑さの解決に特化したAI技術を開発* +
+ +## はじめに + +自然の出来事をモデル化し予測する深層学習の能力は急速に高まっており、次の10年間に、自然科学に革命を起こすかも知れません。薬の開発から再生可能エネルギーまでの各セクターで、大きな進展をもたらす新しい科学的探求の時代が到来するでしょう。「地球上のすべての人と組織がもっと多くのことを成し遂げられるようにする」というMicrosoftのミッションに従い、この機会に、[DeepSpeedチーム](https://www.deepspeed.ai/)では[DeepSpeed4Science](https://deepspeed4science.ai/)という新しいイニシアティブを立ち上げました。これは、AIシステム技術のイノベーションを通じて他に類を見ない技術を構築し、様々な分野の専門家が、科学分野における大きな謎を解き明かす手助けをすることを目指しています。 + +[DeepSpeed](https://www.deepspeed.ai/)システムは、Microsoftが開発した、AI分野をリードするオープンソースのAIシステムのフレームワークであり、多様なAIハードウェア上での深層学習の訓練と推論において、前例のない規模と速度を実現します。図1は、この新しいDeepSpeed4Scienceイニシアティブでの基本的なアプローチを示しています。DeepSpeedの現在の柱となる技術(訓練、推論、圧縮)を基盤として活用しつつ、DeepSpeed4Scienceでは、大規模言語モデル(LLM)を加速するための汎用の技術的アプローチを超え、科学的発見を加速する目的で新たに構築された、一連のAIシステム技術を提供します。私たちは、重要な科学的ミッションを推進している、代表的な科学分野向けAIモデルを所有する内外のチームと連携し、ドメイン固有のAIシステムの課題を特定し、解決していきます。これには、気候科学、薬物設計、生物学的理解、分子動力学シミュレーション、がんの診断と監視、触媒/材料の発見、およびその他の分野が含まれます。 + +私たちの長期的なビジョンは、DeepSpeed4Scienceを、科学的発見をサポートする先進的なAIシステム技術を共有するための新しいソフトウェアプラットフォームおよび統一的なリポジトリに発展させることです。DeepSpeed4Scienceは、Microsoftの[AI for Good](https://www.microsoft.com/en-us/ai/ai-for-good)のコミットメントを反映して、包括的に設計されています。このことは、AI4Scienceへのもっとも重要な投資の成果として構築された、様々な代表的モデルへの、DeepSpeed4Scienceイニシアティブによるサポートに現れています。このブログでは、DeepSpeed4Scienceが、構造生物学の研究における2つの重要なシステムの課題にどのように対処するかを紹介します:(1) Evoformer中心のタンパク質構造予測モデルをスケールアップする際に極めて大きなメモリが必要となる問題を解決し、(2) パンデミックを引き起こすウイルスの進化の様子をよりよく理解するための非常に長いシーケンスのサポートを可能にします。 + +## 主要な初期コラボレータ + +DeepSpeed4Scienceによる新しいシステム技術はAI駆動の幅広い科学研究を強化するものです。現在、DeepSpeed4Scienceは、[Microsoft Research AI4Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai4science/)、[Microsoft WebXT/Bing](https://www.msn.com/en-us/weather/forecast/)、[U.S. DoE National Labs](https://www.energy.gov/national-laboratories)、および複数の大学のいくつかの重要な科学モデルをサポートしています。 + +### Microsoft内のパートナーシップ + +#### 科学基盤モデル (Scientific Foundation Model, SFM), Microsoft Research AI4Science + +
+ + + +*図2: 科学基盤モデル (Scientific foundation model, SFM) とその探索: Distributional Graphormer* +
+ +科学的基盤モデル(SFM)は、多様なインプット、複数の科学領域(薬物、材料、生物学、健康など)、および計算タスクをサポートする、自然科学的発見を強化するための統一された大規模基盤モデルを作成することを目的としています。DeepSpeed4Scienceパートナーシップは、[Distributional Graphormer](https://www.microsoft.com/en-us/research/blog/distributional-graphormer-toward-equilibrium-distribution-prediction-for-molecular-systems/)などのMicrosoftの新しい生成AI手法などのプロジェクトに関する、SFMチームの継続的な研究を強化するための新しい訓練および推論テクノロジーを提供します。 + +#### ClimaX, Microsoft Research AI4Science + +
+ + +*図3: 天気・気候の多様なモデリングタスクのための最初の基盤モデルClimaX* +
+ +気候の変化は、より頻繁な異常気象を引き起こしています。悪影響を軽減するため、これらのイベントが発生する場所を予測することがますます重要になっています。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)は、さまざまな気象および気候モデリングタスクを実行するために設計された最初の基盤モデルです。さまざまな変数と解像度を持つ多くの異なるデータセットを扱えるため、天気予報の精度が向上する可能性があります。DeepSpeed4Scienceは、非常に大きな高解像度画像データ(数十から数百ペタバイトなど)を長いシーケンスで処理しながら、より大きな基盤モデルを効率的に事前訓練/ファインチューニングするためのClimaXの新しいシステムサポートを提供しています。 + +#### AIを用いたAb Initio分子動力学法(AI Powered Ab Initio Molecular Dynamics,AI2MD),Microsoft Research AI4Science + +
+ + +*図4: 100万ステップの分子動力学シミュレーション: RBD-proteinとprotein inhibitorの相互作用* +
+ +このプロジェクトは、古典的な分子動力学の効率とスケーラビリティを維持しながら、[AIを利用した力場モデル](https://www.microsoft.com/en-us/research/publication/ai2bmd-efficient-characterization-of-protein-dynamics-with-ab-initio-accuracy/)を使用して、原理に基づく精度(ab initio accuracy)に近い精度で大規模(原子数で100万規模)な分子システムの力学をシミュレートします。このシミュレーションは、化学的に重要なイベントを観察するのに十分な長さの軌道を生成できる効率を実現しています。通常、このプロセスには数百万から数十億の推論ステップが必要です。これは、グラフニューラルネットワーク(GNN)+ LLMモデルの推論速度を最適化する上で大きな課題となります。DeepSpeed4Scienceは、この課題に対して、新しいシステムサポートを提供します。 + +#### 天気 from Microsoft Start, Microsoft WebXT/Bing + +
+ + +*図5: Microsoft Startにおける降水予想 (次の4時間について4分ごと)* +
+ +[天気 from Microsoft Start](https://www.msn.com/en-us/weather/forecast/)は、[ユーザーがライフスタイル、健康、仕事、活動についてより適切な決定を下せるよう](https://blogs.windows.com/windowsexperience/2022/08/31/microsoft-joins-noaas-weather-ready-nation-ambassador-initiative-to-help-improve-americas-readiness-and-response-to-weather-events/)、正確な気象情報を提供します。 (1 時間ごとに複数回更新される、10 日間に渡る正確かつグローバルな天気予報など)。 以前にも、この天気予報は、DeepSpeedの技術を使用して、マルチ GPU を用いた訓練を高速化していました。現在、DeepSpeed4ScienceはMicrosoft WebXT気象チームと協力して、最先端の機能と更なる改善により、マイクロソフトの気象サービスをさらに強化しています。 + +### 外部のコラボレータ + +DeepSpeed4Scienceは、構造生物学研究のための2つの先駆的なLLMベースのAIモデルを扱うことから始まりました: オープンソースのハイフィデリティタンパク質構造予測モデルであるコロンビア大学の[OpenFold](https://openfold.io/)と、SARS-CoV-2(COVID-19)ゲノムの進化を学習する、[Gordon Bell Special Prize](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)を受賞したゲノム用言語モデルである[アルゴンヌ国立研究所](https://www.anl.gov/)の[GenSLMs](https://github.com/ramanathanlab/genslm)です。次のセクションでは、今日のAI主導の構造生物学研究が直面している2つの一般的なAIシステムの課題を紹介し、DeepSpeed4Scienceが科学研究をどのように強化したかについて説明します。 + +またDeepSpeed4Scienceは最近、より多様な科学モデルをサポートするために、その対象を拡大しました。たとえば、[Aurora Exascaleシステム](https://www.anl.gov/aurora)で、1兆パラメータの科学モデルを訓練するアルゴンヌ国立研究所との協力にあたって、DeepSpeed4Scienceテクノロジーは、求められるパフォーマンス要件とスケーラビリティを実現するのに重要な役割を果たします。さらに、DeepSpeed4Scienceは、がんの調査に関して、[オークリッジ国立研究所](https://ai-roadmap.ornl.gov/)および[国立がん研究所(NCI)](https://www.cancer.gov/)と協力することにより、[MOSSAICプロジェクト](https://www.olcf.ornl.gov/tag/mossaic/)の非構造化臨床テキストからの情報の高信頼度抽出と分類にも用いられます。さらに、DeepSpeed4Scienceのテクノロジーは、[ブルックヘブン国立研究所](https://www.bnl.gov/world/)にも採用され、LLMを使用してより現実的なシミュレーションデータを生成することにより、クリーンエネルギー研究用の大規模なデジタルツインモデルの開発をサポートします。外部のコラボレータとその科学ミッションに関するより詳細な情報は、[deepspeed4science.ai](https://deepspeed4science.ai/)に掲載しています。 + +## パートナーシップの事例 + +### 事例(I): DeepSpeed4ScienceのDS4Sci_EvoformerAttentionにより、Evoformerで構成された生物学モデルをスケールアップする際のメモリ問題を解決 + +
+ + + +*図6: モデル学習の進行に伴うPDB chain 7B3A_AについてのOpenFoldの予測* +
+ +[OpenFold](https://github.com/aqlaboratory/openfold)は、DeepMindによる[AlphaFold2](https://alphafold.com/)をオープンソースで再現したものであり、新しいデータセットでAlphaFold2を訓練またはファインチューニングすることを可能にします。研究者は、これを使用して、AlphaFold2をゼロから再訓練して新しいモデルパラメータを作成し、AlphaFold2の初期訓練フェーズを研究し(図6)、新しいタンパク質フォールディングシステムを開発しました。 + +
+ + +*図7: OpenFoldで可能な最大の訓練サンプル次元を持つ多重配列アライメント(MSA)アテンションカーネル(バイアス付き)のバリエーションを訓練するために必要なピークメモリ。(左)AlphaFold2で使用されているEvoformerAttentionを用いたオリジナルのOpenFold実装。この種のタンパク質構造予測モデルの訓練/推論では、極めて多くのメモリが必要とされることは一般的な課題となっている。特に、最新技術として広く知られるFlashAttentionでも、このような科学研究のためのアテンションのバリエーションを効果的にサポートできない。(右)DS4Sci_EvoformerAttentionと呼ばれるDeepSpeed4Scienceの新しい技術は、精度を落とすことなく、OpenFoldモデルの訓練に必要なピークメモリを1/13に大幅に削減する。* +
+ +OpenFoldには、最先端のシステムテクノロジーを使用したパフォーマンスとメモリの最適化が含まれていますが、AlphaFold2をゼロから訓練することは依然として大きな計算コストがかかります。現段階でのモデルは、パラメータ数の絶対値は小さい(9,300万個)のですが、極めて大きなアクティベーションを持つアテンションのバリエーションが含まれています。標準的なAlphaFold2訓練のファインチューニングフェーズでは、これらのバリエーションのうちのの1つが生成したロジットテンソル(入力としてモデルに供給されるディープタンパク質MSAに対応するように設計されたもの)は、半精度浮動小数で12GBを超え、同等のサイズの言語モデルが使用するメモリを大幅に上回ります。Activation checkpointingや、DeepSpeed ZeRO 最適化などの手法を使用しても、非常に多くのメモリが必要とされるため、モデルを訓練できるシーケンスの長さと MSA の深さが大幅に制限されます。さらに、近似解を与えるような戦略を用いると、モデルの精度と収束に大きな影響を与える可能性があり、それでもメモリが爆発的に増加します(図7の左側のバー(オレンジ色))。 + +DeepSpeed4Scienceは、構造生物学研究(タンパク質構造予測や平衡分布予測など)におけるこの一般的なシステムの課題に対処するために、このカテゴリの科学モデルに広く見られるアテンションのバリエーション(つまりEvoformerAttention)用にカスタマイズされた正確なアテンションのカーネルを設計することにより、このメモリの非効率性の問題に対処しています。具体的には、高度なフュージョン/タイリング戦略とオンザフライのメモリ削減方法によって可能になるメモリ効率の高いDS4Sci_EvoformerAttentionカーネルのセットを、高品質の機械学習プリミティブとして、より広いコミュニティ向けに作成しました。これらをOpenFoldに組み込むことで、訓練中の速度が大幅に向上し、訓練と推論のためのモデルのピークメモリが大幅に削減されます。これにより、OpenFoldはより大きく、より複雑なモデル、より長いシーケンスで実験し、より幅広いハードウェアで訓練することができます。この技術の詳細については、[こちら](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/)をご覧ください。 + +### 事例(II): DeepSpeed4Scienceのシステムとアルゴリズムの両方からのアプローチにより、ゲノム基盤モデルでの非常に長い系列の使用をサポート + +
+ + +*図8: GenSLMs:2022年ACM Gordon Bell Special Prize受賞COVIDゲノム用モデル(GPT-NeoXに基づく25B/33Bモデル)。SARS-CoV-2ゲノムの生物学的に意味のある特性を記述する潜在空間を学習するために使用される。このGIFは、重要なタンパク質ファミリーであるリンゴ酸デヒドロゲナーゼ(malate dehydrogenase)を可視化し、配列の長さやGC含量(アデニンとチミンと比較した核酸グアニンとシトシンの含量の比率。これはDNA鎖が熱に耐える能力を測るものである。)などの重要な特徴で色付けされた潜在空間の投影を表示している。* +
+ +アルゴンヌ国立研究所が開発し、[2022年ACM Gordon Bell Special Prize](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)を受賞したゲノム用言語モデルである[GenSLMs](https://github.com/ramanathanlab/genslm)は、ゲノムデータに大規模言語モデル(LLM)を適用することにより、SARS-CoV-2(COVID-19)ゲノムの進化を学習します。これは、パンデミックを引き起こすウイルス、特にSARS-CoV-2の新たに出現する亜種を特定し、分類する方法を変えるように設計されています。GenSLMsは、他の予測タスクに一般化できる最初のゲノム基盤モデルの1つです。潜在空間をうまく表現することにより、GenSLMsはウイルス配列だけでなく新しいドメインに適用し、細菌性病原体や真核生物をモデル化する能力を拡大し、機能、経路のメンバーシップ、進化的関係などを理解することができます。この科学的目標を達成するために、GenSLMsおよび同様のモデルは、[FlashAttention](https://arxiv.org/abs/2307.08691)のように、長いシーケンスのための一般的な戦略では扱うことが困難なレベルの、非常に長いシーケンスサポートを、訓練と推論の両方に対して必要とします。DeepSpeed4Scienceの新しい設計により、科学者はより長いシーケンスでモデルを構築および訓練できるようになり、以前は扱えなかった科学探索が可能になりました。 + +
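As a back-of-envelope illustration of why sequence lengths of this order cannot be handled by materializing attention scores naively (the head count below is a hypothetical value chosen only for illustration, not GenSLMs' actual configuration):

```python
# Naive self-attention materializes an (L x L) score matrix per head and per layer.
def score_matrix_gib(seq_len, heads, bytes_per_elem=2):  # 2 bytes ~ fp16/bf16
    return seq_len * seq_len * heads * bytes_per_elem / 2**30

for seq_len in (42_000, 512_000):  # sequence lengths discussed in this post
    gib = score_matrix_gib(seq_len, heads=16)  # 16 heads is a hypothetical choice
    print(f"L = {seq_len:>7,}: ~{gib:,.0f} GiB of attention scores per layer")
```

Even at the shorter length the scores alone run to tens of GiB per layer, and at 512K they reach several TiB, which is why blockwise attention kernels and sequence parallelism across GPUs are combined in the release described below.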
+ + +*図9: 異なるスケールで異なるフレームワークがサポートする2つのGenSLMsモデルの最大シーケンス長。1ノードあたり8個の40G A100 GPUを搭載したNVIDIA DGXノードを使用。* +
+ +システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/microsoft/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術を相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。 + +## まとめとロードマップ + +DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせはまでお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)でお願いします。 + +## 謝辞 + +**Core DeepSpeed4Science Team:** + +Shuaiwen Leon Song (DeepSpeed4Science lead), Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Xiaoxia (Shirley) Wu, Masahiro Tanaka, Martin Cai, Adam Graham, Charlie Zhou, Yuxiong He (DeepSpeed team lead) + +**Our Founding Collaborators (in alphabetical order):** + +**Argonne National Lab team:** Rick Stevens, Cristina Negri, Rao Kotamarthi, Venkatram Vishwanath, Arvind Ramanathan, Sam Foreman, Kyle Hippe, Troy Arcomano, Romit Maulik, Maxim Zvyagin, Alexander Brace, Yuntian Deng, Bin Zhang, Cindy Orozco Bohorquez, Austin Clyde, Bharat Kale, Danilo Perez-Rivera, Heng Ma, Carla M. Mann, Michael Irvin, J. Gregory Pauloski, Logan Ward, Valerie Hayot, Murali Emani, Zhen Xie, Diangen Lin, Maulik Shukla, Weili Nie, Josh Romero, Christian Dallago, Arash Vahdat, Chaowei Xiao, Thomas Gibbs, Ian Foster, James J. Davis, Michael E. Papka, Thomas Brettin, Anima Anandkumar + +**AMD:** Ivo Bolsen, Micheal Schulte, Bo Begole, Angela Dalton, Steve Reinhart, Ashwin Aji, Jalal Mahmud, Mahesh Balashibramanian + +**Brookhaven National Lab team:** Adolfy Hoisie, Shinjae Yoo, Yihui Ren. 
+ +**Columbia University OpenFold team:** Mohammed AlQuraishi, Gustaf Ahdritz + +**Microsoft Research AI4Science team:** Christopher Bishop, Bonnie Kruft, Max Welling, Tie-Yan Liu, Christian Bodnar, Johannes Brandsetter, Wessel Bruinsma, Chan Cao, Yuan-Jyue Chen, Peggy Dai, Patrick Garvan, Liang He, Elizabeth Heider, PiPi Hu, Peiran Jin, Fusong Ju, Yatao Li, Chang Liu, Renqian Luo, Qi Meng, Frank Noe, Tao Qin, Janwei Zhu, Bin Shao, Yu Shi, Wenlei Shi, Gregor Simm, Megan Stanley, Lixin Sun, Yue Wang, Tong Wang, Zun Wang, Lijun Wu, Yingce Xia, Leo Xia, Shufang Xie, Shuxin Zheng, Jianwei Zhu + +**Oakridge National Lab team:** Prassana Balaprakash, Georgia Tourass + +**Princeton University:** William Tang, Kyle Felker, Alexey Svyatkovskiy (Microsoft liaison) + +**Rutgers University:** Hang Liu + +**WebXT Weather team:** Pete Luferenko, Divya Kumar, Jonathan Weyn, Ruixiong Zhang, Sylwester Klocek, Volodymyr Vragov diff --git a/blogs/deepspeed4science/media/Figure1.png b/blogs/deepspeed4science/media/Figure1.png new file mode 100644 index 000000000000..614c4b40d6a1 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure1.png differ diff --git a/blogs/deepspeed4science/media/Figure2-1.jpg b/blogs/deepspeed4science/media/Figure2-1.jpg new file mode 100644 index 000000000000..6008ccd91d09 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-1.jpg differ diff --git a/blogs/deepspeed4science/media/Figure2-2.gif b/blogs/deepspeed4science/media/Figure2-2.gif new file mode 100644 index 000000000000..0890be7d7e31 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-2.gif differ diff --git a/blogs/deepspeed4science/media/Figure3.png b/blogs/deepspeed4science/media/Figure3.png new file mode 100644 index 000000000000..465e80e15a25 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure3.png differ diff --git a/blogs/deepspeed4science/media/Figure4.gif b/blogs/deepspeed4science/media/Figure4.gif new file mode 100644 index 000000000000..b45a5f28fd36 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure4.gif differ diff --git a/blogs/deepspeed4science/media/Figure5.gif b/blogs/deepspeed4science/media/Figure5.gif new file mode 100644 index 000000000000..a26c20103269 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure5.gif differ diff --git a/blogs/deepspeed4science/media/Figure6-1.png b/blogs/deepspeed4science/media/Figure6-1.png new file mode 100644 index 000000000000..65f7f9309f71 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure6-1.png differ diff --git a/blogs/deepspeed4science/media/Figure6-2.gif b/blogs/deepspeed4science/media/Figure6-2.gif new file mode 100644 index 000000000000..b50588c227d7 Binary files /dev/null and b/blogs/deepspeed4science/media/Figure6-2.gif differ diff --git a/blogs/deepspeed4science/media/Figure7.jpg b/blogs/deepspeed4science/media/Figure7.jpg new file mode 100644 index 000000000000..eaa92007268b Binary files /dev/null and b/blogs/deepspeed4science/media/Figure7.jpg differ diff --git a/blogs/deepspeed4science/media/Figure8.gif b/blogs/deepspeed4science/media/Figure8.gif new file mode 100644 index 000000000000..624384910f2a Binary files /dev/null and b/blogs/deepspeed4science/media/Figure8.gif differ diff --git a/blogs/deepspeed4science/media/Figure9.png b/blogs/deepspeed4science/media/Figure9.png new file mode 100644 index 000000000000..f00fd9b6917f Binary files /dev/null and b/blogs/deepspeed4science/media/Figure9.png differ diff --git a/csrc/deepspeed4science/evoformer_attn/attention.cpp 
b/csrc/deepspeed4science/evoformer_attn/attention.cpp new file mode 100644 index 000000000000..ac3364539ff1 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/attention.cpp @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +void attention_impl(torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& o, + torch::Tensor& lse); +void attention(torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& o, + torch::Tensor& lse) +{ + attention_impl(q, k, v, bias1, bias2, o, lse); +} + +void attention_back_impl(torch::Tensor& go, + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& o, + torch::Tensor& lse, + torch::Tensor& delta, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& gq, + torch::Tensor& gk, + torch::Tensor& gv, + torch::Tensor& gb1, + torch::Tensor& gb2); +void attention_bwd(torch::Tensor& go, + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& o, + torch::Tensor& lse, + torch::Tensor& delta, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& gq, + torch::Tensor& gk, + torch::Tensor& gv, + torch::Tensor& gb1, + torch::Tensor& gb2) +{ + attention_back_impl(go, q, k, v, o, lse, delta, bias1, bias2, gq, gk, gv, gb1, gb2); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("attention", &attention, ""); + m.def("attention_bwd", &attention_bwd, ""); +} diff --git a/csrc/deepspeed4science/evoformer_attn/attention.cu b/csrc/deepspeed4science/evoformer_attn/attention.cu new file mode 100644 index 000000000000..37636c4bf988 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/attention.cu @@ -0,0 +1,160 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include "gemm_kernel_utils.h" +#include "kernel_forward.h" +#include "transform/bias_broadcast.h" + +template + class Broadcast1_, + template + class Broadcast2_> +typename std::enable_if::value>::type attention_impl_template( + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& o, + float* lse_ptr) +{ + EVOFORMER_CHECK(false, "Unsupported GPU and data type combination") +} + +template + class Broadcast1_, + template + class Broadcast2_> +typename std::enable_if::value>::type attention_impl_template( + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& o, + float* lse_ptr) +{ + // Attention definition goes here, replaced with BroadcastType1 and + // BroadcastType2 + using Attention = AttentionKernel; + + static_assert(!Attention::kNeedsOutputAccumulatorBuffer, + "This test does not support output accumulator buffer"); + int head_size = q.size(-1); + int head_number = q.size(-2); + int seq_length = q.size(-3); + auto q_view = q.view({-1, seq_length, head_number, head_size}); + auto k_view = k.view({-1, seq_length, head_number, head_size}); + auto v_view = v.view({-1, seq_length, head_number, head_size}); + auto o_view = o.view({-1, seq_length, head_number, head_size}); + int batch_size = q_view.size(0); + auto q_ptr = reinterpret_cast(q.data_ptr()); + auto k_ptr = reinterpret_cast(k.data_ptr()); + auto v_ptr = reinterpret_cast(v.data_ptr()); + auto o_ptr = reinterpret_cast(o.data_ptr()); + + auto bias1_ptr = reinterpret_cast(bias1.data_ptr()); + auto bias2_ptr = reinterpret_cast(bias2.data_ptr()); + + typename Attention::Params p; + { // set parameters + p.query_ptr = q_ptr; + p.key_ptr = k_ptr; + p.value_ptr = v_ptr; + p.logsumexp_ptr = lse_ptr; // Only needed for bw + p.output_accum_ptr = nullptr; + p.output_ptr = o_ptr; + p.scale = 1.0f / sqrt(float(head_size)); + + p.bias1_ptr = bias1_ptr; + p.bias2_ptr = bias2_ptr; + p.B = q.size(0); + p.N = q.size(1); + + p.num_heads = head_number; + p.num_batches = batch_size; + p.head_dim = head_size; + p.head_dim_value = head_size; + p.num_queries = seq_length; + p.num_keys = seq_length; + + // All tensors are in BMHK shapes + p.q_strideH = q_view.stride(-2); + p.k_strideH = k_view.stride(-2); + p.v_strideH = v_view.stride(-2); + p.q_strideM = q_view.stride(-3); + p.k_strideM = k_view.stride(-3); + p.v_strideM = v_view.stride(-3); + p.o_strideM = o_view.stride(-3); + p.q_strideB = q_view.stride(-4); + p.k_strideB = k_view.stride(-4); + p.v_strideB = v_view.stride(-4); + } + + constexpr auto kernel_fn = attention_kernel_batched_impl; + int smem_bytes = sizeof(typename Attention::SharedStorage); + if (smem_bytes > 0xc000) { + cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes); + } + if (!Attention::check_supported(p)) { throw std::runtime_error("Parameters not supported"); } + kernel_fn<<>>(p); +} + +#define CODE(scalar_t, torch_scalar_t) \ + do { \ + if (bias1.size(0) == 0 && bias2.size(0) == 0) { \ + attention_impl_template(q, k, v, bias1, bias2, o, lse_ptr); \ + } else if (bias1.size(0) == 0) { \ + attention_impl_template(q, k, v, bias1, bias2, o, lse_ptr); \ + } else if (bias2.size(0) == 0) { \ + attention_impl_template(q, k, v, bias1, bias2, o, lse_ptr); \ + } else { \ + attention_impl_template( \ + q, k, v, bias1, bias2, o, lse_ptr); \ + } \ + } while (0) + +// Function to select 
and call the correct template based on biases sizes +void attention_impl(torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& o, + torch::Tensor& lse) +{ + auto lse_ptr = lse.size(0) == 0 ? nullptr : reinterpret_cast(lse.data_ptr()); + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + DISPATCH_ARCHTAG(prop->major * 10 + prop->minor, + DISPATCH_TYPES(q, { CODE(scalar_t, torch_scalar_t); })); +} diff --git a/csrc/deepspeed4science/evoformer_attn/attention_back.cu b/csrc/deepspeed4science/evoformer_attn/attention_back.cu new file mode 100644 index 000000000000..a82c4ec68a13 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/attention_back.cu @@ -0,0 +1,218 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include +#include "gemm_kernel_utils.h" +#include "kernel_backward.h" +#include "transform/bias_broadcast.h" + +constexpr auto kBlockSizeI = 64; +constexpr auto kBlockSizeJ = 64; + +template + class Broadcast1_, + template + class Broadcast2_> +typename std::enable_if::value>::type attention_back_impl_template( + torch::Tensor& go, + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& o, + torch::Tensor& lse, + torch::Tensor& delta, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& gq, + torch::Tensor& gk, + torch::Tensor& gv, + torch::Tensor& gb1, + torch::Tensor& gb2) +{ + EVOFORMER_CHECK(false, "Unsupported GPU and data type combination") +} + +template + class Broadcast1_, + template + class Broadcast2_> +typename std::enable_if::value>::type attention_back_impl_template( + torch::Tensor& go, + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& o, + torch::Tensor& lse, + torch::Tensor& delta, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& gq, + torch::Tensor& gk, + torch::Tensor& gv, + torch::Tensor& gb1, + torch::Tensor& gb2) +{ + constexpr bool kPreload_ = arch::kMinComputeCapability >= 80; + using Kernel = AttentionBackwardKernel; + int head_size = q.size(-1); + int head_number = q.size(-2); + int seq_length = q.size(-3); + auto q_view = q.view({-1, seq_length, head_number, head_size}); + auto k_view = k.view({-1, seq_length, head_number, head_size}); + auto v_view = v.view({-1, seq_length, head_number, head_size}); + auto o_view = o.view({-1, seq_length, head_number, head_size}); + auto do_view = go.view({-1, seq_length, head_number, head_size}); + auto dk_view = gk.view({-1, seq_length, head_number, head_size}); + auto dv_view = gv.view({-1, seq_length, head_number, head_size}); + auto dq_view = gq.view({-1, seq_length, head_number, head_size}); + auto q_ptr = reinterpret_cast(q.data_ptr()); + auto k_ptr = reinterpret_cast(k.data_ptr()); + auto v_ptr = reinterpret_cast(v.data_ptr()); + auto o_ptr = reinterpret_cast(o.data_ptr()); + auto do_ptr = reinterpret_cast(go.data_ptr()); + auto dk_ptr = reinterpret_cast(gk.data_ptr()); + auto dv_ptr = reinterpret_cast(gv.data_ptr()); + auto dq_ptr = reinterpret_cast(gq.data_ptr()); + auto db1_ptr = gb1.size(0) > 0 ? reinterpret_cast(gb1.data_ptr()) : nullptr; + auto db2_ptr = gb2.size(0) > 0 ? 
reinterpret_cast(gb2.data_ptr()) : nullptr; + auto lse_ptr = reinterpret_cast(lse.data_ptr()); + auto delta_ptr = reinterpret_cast(delta.data_ptr()); + auto bias1_ptr = reinterpret_cast(bias1.data_ptr()); + auto bias2_ptr = reinterpret_cast(bias2.data_ptr()); + static_assert(Kernel::kKernelComputesDelta, "Kernel must compute delta"); + + typename Kernel::Params p; + p.query_ptr = q_ptr; + p.key_ptr = k_ptr; + p.value_ptr = v_ptr; + p.logsumexp_ptr = lse_ptr; + p.output_ptr = o_ptr; + p.grad_output_ptr = do_ptr; + p.delta_ptr = delta_ptr; + p.grad_query_ptr = dq_ptr; + p.grad_key_ptr = dk_ptr; + p.grad_value_ptr = dv_ptr; + + p.grad_bias1_ptr = db1_ptr; + p.grad_bias2_ptr = db2_ptr; + p.B = q.size(0); + p.N = q.size(1); + p.bias1_ptr = bias1.size(0) ? bias1_ptr : nullptr; + p.bias2_ptr = bias2.size(0) ? bias2_ptr : nullptr; + + p.scale = 1.0f / sqrtf(head_size); + + p.head_dim = head_size; + p.head_dim_value = head_size; + p.num_queries = seq_length; + p.num_keys = seq_length; + p.num_heads = head_number; + + p.q_strideM = q_view.stride(-3); + p.k_strideM = k_view.stride(-3); + p.v_strideM = v_view.stride(-3); + p.gO_strideM = do_view.stride(-3); + p.o_strideH = o_view.stride(-2); + p.q_strideH = q_view.stride(-2); + p.k_strideH = k_view.stride(-2); + p.v_strideH = v_view.stride(-2); + p.o_strideB = o_view.stride(-4); + p.q_strideB = q_view.stride(-4); + p.k_strideB = k_view.stride(-4); + p.v_strideB = v_view.stride(-4); + p.lse_strideB = lse.stride(-3); + p.lse_strideH = lse.stride(-2); + p.delta_strideB = delta.stride(-3); + p.delta_strideH = delta.stride(-2); + p.num_batches = q_view.size(-4); + + p.gO_strideB = do_view.stride(-4); + p.gQ_strideB = dq_view.stride(-4); + p.gK_strideB = dk_view.stride(-4); + p.gV_strideB = dv_view.stride(-4); + p.gO_strideH = do_view.stride(-2); + p.gQ_strideH = dq_view.stride(-2); + p.gK_strideH = dk_view.stride(-2); + p.gV_strideH = dv_view.stride(-2); + + torch::Tensor workspace = torch::empty(p.workspace_size() / 4, lse.options()); + p.workspace = workspace.data_ptr(); + + auto kernel_fn = attention_kernel_backward_batched_impl; + size_t smem_bytes = sizeof(typename Kernel::SharedStorage); + cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, int(smem_bytes)); + if (!Kernel::check_supported(p)) { throw std::runtime_error("Unsupported parameters"); } + kernel_fn<<>>(p); +} + +#define CODE(scalar_t, torch_scalar_t) \ + do { \ + if (bias1.size(0) == 0 && bias2.size(0) == 0) { \ + attention_back_impl_template( \ + go, q, k, v, o, lse, delta, bias1, bias2, gq, gk, gv, gb1, gb2); \ + } else if (bias1.size(0) > 0 && bias2.size(0) > 0) { \ + attention_back_impl_template( \ + go, q, k, v, o, lse, delta, bias1, bias2, gq, gk, gv, gb1, gb2); \ + } else if (bias1.size(0) > 0) { \ + attention_back_impl_template( \ + go, q, k, v, o, lse, delta, bias1, bias2, gq, gk, gv, gb1, gb2); \ + } else { \ + attention_back_impl_template( \ + go, q, k, v, o, lse, delta, bias1, bias2, gq, gk, gv, gb1, gb2); \ + } \ + } while (0) + +void attention_back_impl(torch::Tensor& go, + torch::Tensor& q, + torch::Tensor& k, + torch::Tensor& v, + torch::Tensor& o, + torch::Tensor& lse, + torch::Tensor& delta, + torch::Tensor& bias1, + torch::Tensor& bias2, + torch::Tensor& gq, + torch::Tensor& gk, + torch::Tensor& gv, + torch::Tensor& gb1, + torch::Tensor& gb2) +{ + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + DISPATCH_ARCHTAG(prop->major * 10 + prop->minor, + DISPATCH_TYPES(q, { CODE(scalar_t, torch_scalar_t); })); +} diff --git 
a/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_grad_bias.h b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_grad_bias.h new file mode 100644 index 000000000000..17b6479ed8c5 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_grad_bias.h @@ -0,0 +1,250 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once +#include +#include +#include "../iterators/predicated_tile_iterator_atomic.h" +#include "cutlass/epilogue/threadblock/epilogue.h" + +namespace cutlass { +namespace epilogue { +namespace threadblock { +template +struct EpilogueTensorOpAffineRankN : public DefaultEpilogueTensorOpAffineRankN { + using Base = DefaultEpilogueTensorOpAffineRankN; + using OutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankNAtomic< + typename Base::OutputTileThreadMap, + typename Base::ElementOutput, + Rank>; + + using Epilogue = + cutlass::epilogue::threadblock::Epilogue; +}; + +template +struct EpilogueVoltaTensorOpAffineRankN + : public DefaultEpilogueVoltaTensorOpAffineRankN { + using Base = DefaultEpilogueVoltaTensorOpAffineRankN; + using OutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankNAtomic< + typename Base::OutputTileThreadMap, + typename Base::ElementOutput, + Rank>; + + using Epilogue = + cutlass::epilogue::threadblock::Epilogue; +}; + +template +struct EpilogueTensorOp : public DefaultEpilogueTensorOp { + using Base = DefaultEpilogueTensorOp; + using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAtomic< + typename Base::OutputTileThreadMap, + typename Base::ElementOutput, + ScatterD, + PermuteDLayout>; + using Epilogue = + cutlass::epilogue::threadblock::Epilogue; +}; + +template +struct EpilogueVoltaTensorOp : public DefaultEpilogueVoltaTensorOp { + using Base = DefaultEpilogueVoltaTensorOp; + using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAtomic< + typename Base::OutputTileThreadMap, + typename Base::ElementOutput, + ScatterD, + PermuteDLayout>; + using Epilogue = + cutlass::epilogue::threadblock::Epilogue; +}; +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +template +struct BiasGradEpilogue { + using Epilogue = + typename cutlass::epilogue::threadblock::EpilogueTensorOp::Epilogue; +}; + +template +struct BiasGradEpilogue { + using Epilogue = + typename cutlass::epilogue::threadblock::EpilogueVoltaTensorOp::Epilogue; +}; + +template +struct BiasGradEpilogueAffineRankN { + using Epilogue = typename cutlass::epilogue::threadblock::EpilogueTensorOpAffineRankN< + Rank, + Shape_, + WarpMmaTensorOp_, + PartitionsK, + OutputOp_, + ElementsPerAccess>::Epilogue; +}; + +template +struct BiasGradEpilogueAffineRankN { + using Epilogue = typename cutlass::epilogue::threadblock::EpilogueVoltaTensorOpAffineRankN< + Rank, + Shape_, + WarpMmaTensorOp_, + PartitionsK, + OutputOp_, + ElementsPerAccess>::Epilogue; +}; diff --git a/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_pipelined.h b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_pipelined.h new file mode 100644 index 000000000000..3b7b32d61452 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_pipelined.h @@ -0,0 +1,592 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. 
+ + File copied from "cutlass/epilogue/threadblock/epilogue.h" + then modified to: + (1) load 2 source fragments at the same time (pipelining) + (2) support reading from a different dtype + (3) pass the row id to the OutputOp if it takes it + (see MemoryEfficientAttentionNormalize) + Note that in general the fragment passed to the OutputOp could + span multiple rows but it does not happen with the configurations we have +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/vector.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_coord.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" + +#include "cutlass/epilogue/threadblock/epilogue_base.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template +struct ApplyEpilogueOp { + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum, + typename Op::FragmentOutput const& source) + { + return output_op(accum, source); + } + static CUTLASS_DEVICE typename Op::FragmentOutput + apply(Op const& output_op, int row_id, typename Op::FragmentAccumulator const& accum) + { + return output_op(accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Epilogue operator +template ::value), + typename OutputTileSourceIterator_ = + OutputTileIterator_ ///< Tile iterator reading tensors + > +class EpiloguePipelined : public EpilogueBase { +public: + using Base = EpilogueBase; + + using Shape = Shape_; + using WarpMmaOperator = WarpMmaOperator_; + static int const kPartitionsK = PartitionsK; + using OutputTileIterator = OutputTileIterator_; + using OutputTileSourceIterator = OutputTileSourceIterator_; + using AccumulatorFragmentIterator = AccumulatorFragmentIterator_; + using WarpTileIterator = WarpTileIterator_; + using SharedLoadIterator = SharedLoadIterator_; + using OutputOp = OutputOp_; + using Padding = Padding_; + + using Layout = layout::RowMajor; + using LongIndex = typename Layout::LongIndex; + + /// The complete warp-level accumulator tile + using AccumulatorTile = typename Base::AccumulatorTile; + + /// Accumulator element + using ElementAccumulator = typename WarpTileIterator::Element; + + /// Output element + using ElementOutput = typename OutputTileIterator::Element; + using ElementSource = typename OutputTileSourceIterator::Element; + + /// Output access size + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + /// Tensor reference to destination tensor + using TensorRef = typename OutputTileIterator::TensorRef; + + /// Tensor reference to sync tensor + using SyncTensorRef = typename cutlass::TensorRef; + + /// Const tensor reference to source tensor + using ConstTensorRef = typename OutputTileIterator::ConstTensorRef; + + /// Array type used to output + using OutputAccessType = + Array; + using SourceAccessType = Array; + + /// Array type used by output functor + using AccumulatorAccessType = + Array; + + /// Number of warps + 
using WarpCount = typename Base::WarpCount; + + static int constexpr kSmemTiles = + Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK; + static int constexpr kSmemPointerOffset = + Base::SharedStorage::StorageShape::kCount / kSmemTiles; + +public: + static_assert(OutputTileSourceIterator::Fragment::kElements == + OutputTileIterator::Fragment::kElements, + "Mismatch between input tile and output tile iterator (kElements)"); + static_assert(OutputTileSourceIterator::kIterations == OutputTileIterator::kIterations, + "Mismatch between input tile and output tile iterator (kIterations)"); + static_assert(SharedLoadIterator::Fragment::kElements == + OutputTileIterator::Fragment::kElements, + "Mismatch between shared load iterator and output tile iterator."); + + static_assert(OutputTileIterator::kElementsPerAccess, + "OutputTileIterator::kElementsPerAccess must not be zero."); + + static_assert(!(OutputTileIterator::Fragment::kElements % + OutputTileIterator::kElementsPerAccess), + "Divisibility"); + +private: + /// Loads fragment from shared memory aligned with output tensor + SharedLoadIterator shared_load_iterator_; + +public: + /// Constructor + CUTLASS_DEVICE + EpiloguePipelined(typename Base::SharedStorage& shared_storage, ///< Shared storage object + int thread_idx, ///< ID of a thread within the threadblock + int warp_idx, ///< ID of warp within threadblock + int lane_idx ///< Id of thread within warp + ) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + shared_load_iterator_(shared_storage.reference(), thread_idx) + { + } + + /// Streams the result to global memory + CUTLASS_DEVICE + void operator()(OutputOp const& output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& accumulators, ///< Complete warp-level accumulator tile + OutputTileSourceIterator source_iterator) + { ///< Threadblock tile coordinate in GEMM (in units + ///< of threadblock tiles) + + if (!output_op.is_source_needed()) { + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } else { + compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); + } + } + CUTLASS_DEVICE + void operator()(OutputOp const& output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& accumulators) + { ///< Complete warp-level accumulator tile + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } + +private: + template + struct acc2smem_source_not_needed; + + template + struct acc2smem_source_not_needed> { + template + CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator& warp_tile_iterator) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { ++accum_fragment_iterator; } + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + warp_tile_iterator.store(accum_fragment); + if (p < Base::kFragmentsPerIteration - 1) { + warp_tile_iterator.add_pointer_offset(kSmemPointerOffset); + } + } + + if (Base::kFragmentsPerIteration > 1) { + warp_tile_iterator.add_pointer_offset(kSmemPointerOffset * + (1 - Base::kFragmentsPerIteration)); + } + } + + CUTLASS_DEVICE + static void push(size_t pos, + AccumulatorFragmentIterator const& iterator_begin, + 
WarpTileIterator& warp_tile_iterator) + { + int dummy[] = { + (pos == (Seq * Base::kFragmentsPerIteration)) && + (helper(iterator_begin, warp_tile_iterator), + 0)...}; + + CUTLASS_UNUSED(dummy[0]); + } + }; + + static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, + "One of these must be exactly 1."); + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const& output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& accumulators ///< Complete warp-level accumulator tile + ) + { + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + +#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration \ + : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; + iter += Base::kFragmentsPerIteration) { + // + // Convert and store fragment + // + + __syncthreads(); + + acc2smem_source_not_needed>:: + push(iter, accum_fragment_iterator, this->warp_tile_iterator_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + if (p < Base::kFragmentsPerIteration - 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + } else if (kPartitionsK > 1) { + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = + add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * + kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_source_not_needed_(destination_iterator.thread_start_row(), + output_fragment, + output_op, + aligned_accum_fragment[0]); + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + + if (Base::kFragmentsPerIteration > 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * + (1 - Base::kFragmentsPerIteration)); + } + } + } + + template + struct acc2smem_source_needed; + + template + struct acc2smem_source_needed> { + template + CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator& warp_tile_iterator) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { ++accum_fragment_iterator; } + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + accum_fragment_iterator.load(accum_fragment); + warp_tile_iterator.store(accum_fragment); + } + + CUTLASS_DEVICE + static void push(size_t pos, + AccumulatorFragmentIterator const& iterator_begin, + WarpTileIterator& warp_tile_iterator) + { + int dummy[] = {(pos == Seq) && (helper(iterator_begin, warp_tile_iterator), 0)...}; + } + }; + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const& output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& accumulators, 
///< Complete warp-level accumulator tile + OutputTileSourceIterator source_iterator ///< Threadblock tile coordinate in GEMM (in units + ///< of threadblock tiles) + ) + { + typename OutputTileSourceIterator::Fragment source_fragment[2]; + + source_fragment[0].clear(); + source_iterator.load(source_fragment[0]); + ++source_iterator; + source_fragment[1].clear(); + + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + +#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + if (iter > 0) { __syncthreads(); } + // + // Load the source for next iteration (pipelining) + // + + if (iter + 1 < OutputTileIterator::kIterations) { + source_iterator.load(source_fragment[(iter + 1) % 2]); + } + ++source_iterator; + acc2smem_source_needed>:: + push(iter, accum_fragment_iterator, this->warp_tile_iterator_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + // If the number of k-slices is > 1 - perform a reduction amongst the + // k-slices + if (kPartitionsK > 1) { + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = + add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_(destination_iterator.thread_start_row(), + output_fragment, + output_op, + aligned_accum_fragment[0], + source_fragment[iter % 2]); + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_(int begin_row, + typename OutputTileIterator::Fragment& output_fragment, + OutputOp const& output_op, ///< Output operator + typename SharedLoadIterator::Fragment const& aligned_accum_fragment, + typename OutputTileSourceIterator::Fragment const& source_fragment) + { + OutputAccessType* output_frag_ptr = reinterpret_cast(&output_fragment); + + AccumulatorAccessType const* compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + SourceAccessType const* source_frag_ptr = + reinterpret_cast(&source_fragment); + + int const kOutputOpIterations = + OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = ApplyEpilogueOp::apply( + output_op, + begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess), + compute_frag_ptr[i], + source_frag_ptr[i]); + } + } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + int begin_row, + typename OutputTileIterator::Fragment& output_fragment, + OutputOp const& output_op, ///< Output operator + typename SharedLoadIterator::Fragment const& aligned_accum_fragment) + { + OutputAccessType* 
output_frag_ptr = reinterpret_cast(&output_fragment); + + AccumulatorAccessType const* compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + int const kOutputOpIterations = + OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = ApplyEpilogueOp::apply( + output_op, + begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess), + compute_frag_ptr[i]); + } + } + + // This should be constexpr, but it's only supported on c++14 + static int CUTLASS_HOST_DEVICE getRowOffset(int i) + { + using ThreadMap = typename OutputTileIterator::ThreadMap; + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + int frag_idx = ThreadMap::kElementsPerAccess * + (frag_row_idx * ThreadMap::Iterations::kColumn + column); + if (i < frag_idx + ThreadMap::kElementsPerAccess) { return row_offset; } + } + } + } + } + return -1; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_rescale_output.h b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_rescale_output.h new file mode 100644 index 000000000000..f81a09f74f1e --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_rescale_output.h @@ -0,0 +1,251 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory + to match canonical tensor layouts in global memory. Epilogues support + conversion and reduction operations. + + This is a copy of cutlass/epilogue/threadblock/epilogue.h that can + handle "row_id" as a first argument, as uses it to get the corresponding + `m_prime` / `s_prime` to rescale the output. +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/vector.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_coord.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" + +#include "cutlass/epilogue/threadblock/epilogue_base.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "epilogue_pipelined.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. 
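+/// In the evoformer/memory-efficient attention kernels this functor performs
+/// the deferred softmax normalization: the GEMM accumulator holds the current
+/// block's contribution (roughly P_block @ V), `source` holds the previously
+/// accumulated output, and the per-row factors `s_prime` (roughly, the running
+/// softmax denominator) and `m_prime` (a correction term tied to the running
+/// row maximum) rescale and normalize them.
+///
+/// A rough scalar sketch of the streaming-softmax recurrence this supports
+/// (illustrative only, not the exact kernel computation):
+///
+///   m_new = max(m_old, rowmax(S));           // updated running max
+///   c     = exp(m_old - m_new);              // correction when the max moves
+///   s     = s * c + rowsum(exp(S - m_new));  // running denominator
+///   acc   = acc * c + exp(S - m_new) * V;    // rescale previous output
+///   out   = acc / s;                         // final normalization (isLast)
+///
+/// The per-fragment combination applied below is: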
+// output <- alpha * accumulator + beta * source +// with: +// alpha = 1 / s_prime (to normalize when isLast=True, 1 otherwise) +// beta = alpha / m_prime (renormalize the output when the max changes) +// source is the current output +template , + ///< but we use 64 or 32 sometimes when there are not enough data + ///< to store + typename ElementAccumulator_, ///< Accumulator data type + typename ElementCompute_, ///< Data type used to compute linear combination + bool isFirst, + bool isLast, + typename FragmentAlphaBeta_, + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest> +class MemoryEfficientAttentionNormalize { +public: + using ElementOutput = ElementOutput_; + using ElementSource = ElementSource_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentSource = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + using FragmentAlphaBeta = FragmentAlphaBeta_; + + static FloatRoundStyle const kRound = Round; + +private: + // + // Data members + // + + FragmentAlphaBeta const& s_prime_; + FragmentAlphaBeta const& m_prime_; + +public: + /// Constructs the function object, possibly loading from pointers in host + /// memory + CUTLASS_HOST_DEVICE + MemoryEfficientAttentionNormalize(FragmentAlphaBeta const& s_prime, + FragmentAlphaBeta const& m_prime) + : s_prime_(s_prime), m_prime_(m_prime) + { + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { return !isFirst; } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) {} + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()(int row, + FragmentAccumulator const& accumulator, + FragmentSource const& source) const + { + assert(!isFirst); + + // Convert source to internal compute numeric type + NumericArrayConverter source_converter; + NumericArrayConverter + accumulator_converter; + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + + ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1; + ElementCompute beta = alpha * m_prime_[row]; + + intermediate = mul_add_source(beta, converted_source); // X = beta * C + + intermediate = mul_add_accumulator( + alpha, converted_accumulator, intermediate); // D = alpha * Accum + X + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(int row, FragmentAccumulator const& accumulator) const + { + assert(isFirst); + + // Convert source to internal compute numeric type + NumericArrayConverter + accumulator_converter; + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + ComputeFragment intermediate; + multiplies mul_accumulator; + + ElementCompute alpha = isLast ? 
(1 / s_prime_[row]) : 1; + + intermediate = mul_accumulator(alpha, converted_accumulator); // X = alpha * C + uniform + + return destination_converter(intermediate); + } +}; + +} // namespace thread + +namespace threadblock { +template +struct ApplyEpilogueOp< + thread::MemoryEfficientAttentionNormalize> { + using Op = thread::MemoryEfficientAttentionNormalize; + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum, + typename Op::FragmentSource const& source) + { + return output_op(row_id, accum, source); + } + static CUTLASS_DEVICE typename Op::FragmentOutput + apply(Op const& output_op, int row_id, typename Op::FragmentAccumulator const& accum) + { + return output_op(row_id, accum); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_thread_apply_logsumexp.h b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_thread_apply_logsumexp.h new file mode 100644 index 000000000000..46fb2bf17c1c --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/epilogue/epilogue_thread_apply_logsumexp.h @@ -0,0 +1,168 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Functor performing linear combination operations used by epilogues. 
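+
+  Specifically, the ApplyLogSumExp functor defined here computes
+  exp(input - lse) element-wise. In memory-efficient attention this is
+  typically how attention probabilities are re-materialized from the stored
+  row-wise logsumexp values (P = exp(S - LSE)) instead of keeping the full
+  probability matrix in memory.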
+*/ + +#pragma once + +#include + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct ArrayExponential { + CUTLASS_HOST_DEVICE + Array operator()( + Array const& input) const + { + Array result; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ElementsPerAccess; ++i) { result[i] = expf(input[i]); } + + return result; + } +}; + +template +struct ArrayExponential { + CUTLASS_DEVICE + Array operator()(Array const& input) const + { + Array result; + + int const kVectorCount = ElementsPerAccess / 2; + + __half2 const* input_ptr = reinterpret_cast<__half2 const*>(input.raw_data()); + __half2* res_ptr = reinterpret_cast<__half2*>(result.raw_data()); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kVectorCount; ++i) { res_ptr[i] = h2exp(input_ptr[i]); } + + return result; + } +}; +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies: +/// output <- (input - lse).exp() +template +class ApplyLogSumExp { +public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using ElementLSE = ElementLSE_; + + static int const kElementsPerAccess = ElementsPerAccess; + static int const kCount = kElementsPerAccess; + static const ScaleType::Kind kScale = cutlass::epilogue::thread::ScaleType::NoBetaScaling; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using FragmentCompute = Array; + using FragmentLSE = Array; + using FragmentScaleBias = FragmentLSE; // Used by epilogue_smem_accumulator.h + +public: + // + // Methods + // + + CUTLASS_HOST_DEVICE + ApplyLogSumExp() {} + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { return true; } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) {} + + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const& AB, + FragmentLSE const& scale_unused, + // bias used as LSE + FragmentLSE const& bias) const + { + FragmentCompute frag_AB = + NumericArrayConverter()(AB); + FragmentCompute frag_lse_compute = + NumericArrayConverter()(bias); + FragmentCompute frag_compute; + + minus minus_lse; + detail::ArrayExponential apply_exp; + frag_compute = minus_lse(frag_AB, frag_lse_compute); + frag_compute = apply_exp(frag_compute); + + return NumericArrayConverter()( + frag_compute); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma.h b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma.h new file mode 100644 index 000000000000..75833bbfe7d2 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma.h @@ -0,0 +1,119 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include "custom_mma_multistage.h" +#include "custom_mma_pipelined.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" +#include "cutlass/gemm/threadblock/mma_pipelined.h" + +template +struct MakeCustomMma; + +template +struct MakeCustomMma, + kMaxK> { + // Reduce the number of stages if we don't need that many + static int constexpr kStages = + kMaxK == cutlass::platform::numeric_limits::max() + ? Stages + : cutlass::const_min(Stages, (kMaxK + int(Shape::kK) - 1) / int(Shape::kK)); + using Mma = cutlass::gemm::threadblock::CustomMmaMultistage; +}; + +template +struct MakeCustomMma, + kMaxK> { + using Mma = cutlass::gemm::threadblock::CustomMmaPipelined; +}; diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_base.h b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_base.h new file mode 100644 index 000000000000..bbf91240b900 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_base.h @@ -0,0 +1,181 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaBase { +public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = + GemmShape; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + template + struct OperandSharedStorage { + AlignedBuffer buffer; + using TensorRef = TensorRef; + + CUTLASS_DEVICE + static OperandLayout Layout() + { + return OperandLayout::packed({OperandShape::kRow, OperandShape::kColumn}); + } + + /// Returns a TensorRef to the operand + CUTLASS_HOST_DEVICE + TensorRef ref() { return TensorRef{buffer.data(), Layout()}; } + }; + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape; + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape; + + using SharedStorageA = + OperandSharedStorage; + using SharedStorageB = + OperandSharedStorage; + using TensorRefA = typename SharedStorageA::TensorRef; + using TensorRefB = typename SharedStorageB::TensorRef; + + struct SharedStorage { + /// Buffer for A operand + SharedStorageA operand_A; + + /// Buffer for B operand + SharedStorageB operand_B; + }; + +protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorageA& shared_storageA, + SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storageA.ref(), lane_idx), + warp_tile_iterator_B_(shared_storageB.ref(), lane_idx) + { + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_multistage.h b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_multistage.h new file mode 100644 index 000000000000..50ba58b1d1dd --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_multistage.h @@ -0,0 +1,706 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "custom_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
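+///
+/// This variant appears to be adapted from
+/// cutlass::gemm::threadblock::MmaMultistage, with a few additions visible
+/// below:
+///   - a `kMaxK` upper bound on the K extent, so that when every K tile fits
+///     in shared memory (`kSmemContainsEntireMat`) the circular-buffer
+///     wrap-around and the trailing global->shared copies can be skipped;
+///   - `set_prologue_done()`, which lets a caller pre-fill shared memory via
+///     the static `prologue()` helpers and have `operator()` skip its own
+///     prologue;
+///   - `set_zero_outside_bounds()`, which forces out-of-bounds cp.async
+///     accesses to zero-fill so accumulator elements outside the GEMM
+///     footprint stay zero.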
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Upper boundon the K dimension + int kMaxK = cutlass::platform::numeric_limits::max(), + /// Used for partial specialization + typename Enable = bool> +class CustomMmaMultistage : public CustomMmaBase { +public: + ///< Base class + using Base = CustomMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
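+    /// Detail fixes the cp.async schedule for one pipeline stage: loading a
+    /// stage of A/B takes AsyncCopyIterationsPerStage{A,B} copies, and these
+    /// are spread across the kWarpGemmIterations warp-level MMAs as
+    ///   kAccessesPerGroup{A,B} =
+    ///       ceil(AsyncCopyIterationsPerStage{A,B} / kWarpGemmIterations)
+    /// so that global->shared traffic overlaps with math in the mainloop.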
+ struct Detail { + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + }; + + static bool const kSmemContainsEntireMat = kMaxK <= Shape::kK * Stages; + static constexpr int kNumStagesConcurrentLoad = kSmemContainsEntireMat ? Stages : Stages - 1; + +private: + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + +private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + bool prologue_done_; + + // Set to `True` to ensure the accumulator will be zero outside the GEMM + // footprint + bool zero_outside_bounds_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx), + prologue_done_(false), + zero_outside_bounds_(false) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int 
warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaMultistage(st.operand_A, st.operand_B, thread_idx, warp_idx, lane_idx) + { + } + + CUTLASS_DEVICE + bool set_prologue_done(bool value) { prologue_done_ = value; } + + CUTLASS_DEVICE + bool set_zero_outside_bounds(bool value) { zero_outside_bounds_ = value; } + + template + CUTLASS_DEVICE static void prologue(typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) + { + prologue(shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue(typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) + { + SmemIteratorA smem_iterator_A(shared_storageA.ref(), thread_idx); + SmemIteratorB smem_iterator_B(shared_storageB.ref(), thread_idx); + int32_t iter = (problem_size_k + Base::Shape::kK - 1) / Base::Shape::kK; + _prologue(iterator_A, iterator_B, iter, smem_iterator_A, smem_iterator_B); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA& iterator_A, + IteratorB& iterator_B, + int group_start_A = 0, + int group_start_B = 0) + { + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + template + CUTLASS_DEVICE static void 
_prologue(IteratorA& iterator_A, + IteratorB& iterator_B, + int32_t& gemm_k_iterations, + SmemIteratorA& smem_iterator_A_, + SmemIteratorB& smem_iterator_B_) + { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; ++stage, --gemm_k_iterations) { + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + if (kLoadA) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + } + + ++iterator_A; + } + + ++smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + if (kLoadB) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + } + + ++iterator_B; + } + + ++smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + smem_iterator_A_.add_tile_offset({0, 1}); + smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const& src_accum) + { + // + // Prologue + // + + if (!prologue_done_) { + _prologue( + iterator_A, iterator_B, gemm_k_iterations, smem_iterator_A_, smem_iterator_B_); + } else if (!kSmemContainsEntireMat) { + _prologue( + iterator_A, iterator_B, gemm_k_iterations, smem_iterator_A_, smem_iterator_B_); + } else { + gemm_k_iterations -= kNumStagesConcurrentLoad; + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for + // some kernels so that all accumulator elements outside the GEMM footprint + // are zero. 
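+        // (With SharedMemoryClearOption::kClearLastStage, the one stage that
+        // the prologue did not fill is explicitly zeroed below, so stale
+        // shared-memory contents cannot leak into those accumulators.)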
+ // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + /// Iterator to write threadblock-scoped tile of A operand to shared + /// memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared + /// memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], + warp_transformed_frag_B[0], + warp_loaded_frag_A[0], + warp_loaded_frag_B[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. + plus plus_accum; + + FragmentC tmp_accum; + + if (platform::is_same::value || + platform::is_same::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-kNumStagesConcurrentLoad);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % + Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % + Base::kWarpGemmIterations); + + // In case of a non-circular buffer ("kSmemContainsEntireMat") + // make sure we don't load out of bounds data. 
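+                // When the whole K extent is resident in shared memory the
+                // buffer never wraps, so this prefetch is skipped only on the
+                // trailing k-group of the final mainloop iteration, where it
+                // would otherwise read past the loaded tiles.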
+ if (!kSmemContainsEntireMat || gemm_k_iterations > (-kNumStagesConcurrentLoad) || + warp_mma_k < Base::kWarpGemmIterations - 1) { + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + if (platform::is_same::value || + platform::is_same::value) { + warp_mma(tmp_accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma(accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (!kSmemContainsEntireMat && warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + if (!kSmemContainsEntireMat) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
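+                    // cp_async_fence() above closed this stage's group of
+                    // cp.async instructions; cp_async_wait<kStages - 2> then
+                    // blocks until at most kStages - 2 groups remain in
+                    // flight, so the oldest stage is resident in shared memory
+                    // before __syncthreads() lets warps start reading it.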
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (!kSmemContainsEntireMat && smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same::value || + platform::is_same::value) { + accum = plus_accum(accum, tmp_accum); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_pipelined.h b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_pipelined.h new file mode 100644 index 000000000000..07b26ca31299 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/custom_mma_pipelined.h @@ -0,0 +1,388 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "custom_mma_base.h" +#include "cutlass/gemm/gemm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter, + /// + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaPipelined : public CustomMmaBase { +public: + ///< Base class + using Base = CustomMmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from 
global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages == 2), "MmaPipelined requires kStages set to value 2"); + + static bool const kSmemContainsEntireMat = false; + +private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + +protected: + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaPipelined(typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaPipelined( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaPipelined(st.operand_A, st.operand_B, thread_idx, warp_idx, lane_idx) + { + } + + CUTLASS_DEVICE + bool set_prologue_done(bool value) + { + // NOT IMPLEMENTED FOR PIPELINED + } + + CUTLASS_DEVICE + bool set_zero_outside_bounds(bool value) + { + // NOT NEEDED FOR PIPELINED + // shared memory will always be zero-filled + } + + template + CUTLASS_DEVICE static void prologue(typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) + { + prologue(shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + 
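// Illustrative sketch (plain C++, assumed example warp counts) of how the
// constructors in these mainloops map a linear warp_idx onto (m, n, k)
// warp-tile coordinates and then into the per-warp tile offsets handed to the
// A and B warp iterators. All constants below are illustrative assumptions.
#include <cstdio>

struct WarpCoord { int m, n, k; };

constexpr int kWarpCountM = 2;          // warps tiling the threadblock M extent
constexpr int kWarpCountN = 2;          // warps tiling the threadblock N extent
constexpr int kWarpCountK = 2;          // warps splitting the threadblock K extent
constexpr int kWarpGemmIterations = 4;  // warp-level MMAs per threadblock K tile

WarpCoord decompose_warp_idx(int warp_idx) {
    int warp_idx_mn = warp_idx % (kWarpCountM * kWarpCountN);
    int warp_idx_k  = warp_idx / (kWarpCountM * kWarpCountN);
    return {warp_idx_mn % kWarpCountM, warp_idx_mn / kWarpCountM, warp_idx_k};
}

int main() {
    for (int w = 0; w < kWarpCountM * kWarpCountN * kWarpCountK; ++w) {
        WarpCoord c = decompose_warp_idx(w);
        // Mirrors add_tile_offset() in the constructor above:
        //   A gets {m, kWarpGemmIterations * k}, B gets {kWarpGemmIterations * k, n}.
        std::printf("warp %d -> m=%d n=%d k=%d, A offset {%d,%d}, B offset {%d,%d}\n",
                    w, c.m, c.n, c.k,
                    c.m, kWarpGemmIterations * c.k,
                    kWarpGemmIterations * c.k, c.n);
    }
    return 0;
}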
template + CUTLASS_DEVICE static void prologue(typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) + { + // NOT IMPLEMENTED FOR PIPELINED + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + TransformA transform_A = TransformA(), ///< transformation applied to A fragment + TransformB transform_B = TransformB()) + { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_A.clear_mask(gemm_k_iterations <= 1); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tightest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
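// Illustrative sketch of the double-buffering index arithmetic the mainloop
// below relies on: the warp fragment at index (warp_mma_k + 1) % 2 is
// prefetched from shared memory while the fragment at warp_mma_k % 2 feeds the
// warp MMA, and the shared-memory write stage toggles once per threadblock K
// tile (stage 0 was filled by the prologue). Loop trip counts are assumptions.
#include <cstdio>

int main() {
    const int kWarpGemmIterations = 4;  // assumed warp-level MMAs per K tile
    const int gemm_k_tiles = 3;         // assumed threadblock K tiles
    int smem_write_stage_idx = 1;       // the prologue already wrote stage 0

    for (int tile = 0; tile < gemm_k_tiles; ++tile) {
        for (int warp_mma_k = 0; warp_mma_k < kWarpGemmIterations; ++warp_mma_k) {
            int consume  = warp_mma_k % 2;        // fragment fed to warp_mma
            int prefetch = (warp_mma_k + 1) % 2;  // fragment loaded from smem now
            if (warp_mma_k == kWarpGemmIterations - 1) {
                // Last group of this tile: the staged global fragment is written
                // to the other shared-memory stage before the buffers swap roles.
                std::printf("tile %d: store smem stage %d\n", tile, smem_write_stage_idx);
                smem_write_stage_idx ^= 1;
            }
            std::printf("tile %d k %d: mma on frag[%d], prefetch frag[%d]\n",
                        tile, warp_mma_k, consume, prefetch);
        }
    }
    return 0;
}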
+ + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % + Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % + Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_A.clear_mask(gemm_k_iterations <= 2); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + + warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum); + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/find_default_mma.h b/csrc/deepspeed4science/evoformer_attn/gemm/find_default_mma.h new file mode 100644 index 000000000000..163dcbf85259 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/find_default_mma.h @@ -0,0 +1,191 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Cutlass provides helper template functions to figure out the right + datastructures to instantiate to run a GEMM with various parameters (see + `cutlass/gemm/threadblock/default_mma.h`). However, due to template + instantiation priority rules, it will only create an MmaMultiStage with + kStages=3 (otherwise creates an MmePipelined - which is not compatible with + FastF32). kStages=3 uses too much shared memory and we want to use kStages=2, + so we just copy-pasted some code from `default_mma.h` and + `default_mma_core.h` files and wrapped this template to allow our usecase. + + This is really only for the FastF32 case - aka using TensorCores with fp32. +*/ + +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Layout type for C and D matrix operand + typename LayoutC, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + typename Enable_ = void> +struct FindDefaultMma { + static constexpr bool AccumulatorsInRowMajor = false; + static constexpr SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone; + using DefaultMma = cutlass::gemm::threadblock::DefaultMma; +}; + +/// Specialization for sm80 / FastF32 / multistage with kStages=2 +template +struct FindDefaultMma 1)>::type> { + using LayoutC = layout::RowMajor; + using OperatorClass = arch::OpClassTensorOp; + using ArchTag = arch::Sm80; + + using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma; + struct DefaultMma : DefaultMma_ { + using MmaCore_ = typename DefaultMma_::MmaCore; + // Define the threadblock-scoped multistage matrix multiply + 
using ThreadblockMma = + cutlass::gemm::threadblock::MmaMultistage; + }; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/mma_accum_lambda_iterator.h b/csrc/deepspeed4science/evoformer_attn/gemm/mma_accum_lambda_iterator.h new file mode 100644 index 000000000000..5e2f0cf681bf --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/mma_accum_lambda_iterator.h @@ -0,0 +1,347 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include "cutlass/functional.h" +#include "cutlass/gemm/warp/mma_simt_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" +#include "cutlass/matrix_shape.h" + +/* +TensorCores have different accumulator layouts. +This file provides a class to easily map the accumulator +i-th element with the corresponding matrix row/col. 
+*/ + +template +struct AccumLambdaIteratorSm80 { + static_assert(cutlass::platform::is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + static int const kElementsPerAccess = InstructionShape::kN / 4; + static int const kRowsPerTile = 8; + static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile; + + static cutlass::MatrixCoord CUTLASS_DEVICE + get_lane_offset(int8_t lane_id, int8_t warp_id, typename T::TensorCoord const& tile_offset) + { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + return cutlass::MatrixCoord( + quad + tile_offset.row() * Shape::kRow, + lane_in_quad * kElementsPerAccess + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static void iterateRows(cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) + { + // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + row * kRowsPerTile + + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col + + lane_offset.column(); + int idx = mma_accum_start + row * kElementsPerAccess + col; + op(accum_m, accum_n, idx); + } + } + + endRow(accum_m); + } + } + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) + { + // In each warp, 4 threads will work on the same row + // - the ones with the same `quad` + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1); + myValue = fn(myValue, otherV); + otherV = __shfl_xor_sync(0xffffffff, myValue, 2); + myValue = fn(myValue, otherV); + int lane_in_quad = (lane_id & 3); + return lane_in_quad == 0; + } +}; + +template +struct AccumLambdaIteratorSm70 { + static_assert(cutlass::platform::is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + using Element = accum_t; + + static int const kElementsPerPartial = 4; + using EleShapePerPatial = + typename cutlass::platform::conditional::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static cutlass::MatrixCoord CUTLASS_DEVICE + get_lane_offset(int8_t lane_id, int8_t warp_id, typename T::TensorCoord const& tile_offset) + { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad; // 
(quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + return cutlass::MatrixCoord(accum_m + tile_offset.row() * Shape::kRow, + accum_n + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) + { + static_assert(cutlass::platform::is_same::value, + "update to support non-float accum"); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16 + // T0 & T2 share same line within a quad + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 1); + myValue = fn(myValue, otherV); + // quad 0 and quad 2 are on the same lines + otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 3); + myValue = fn(myValue, otherV); + return (lane_id & ((1 << 1) | (1 << 3))) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows(cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) + { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2 + n + + lane_offset.column(); + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + op(accum_m, accum_n, idx); + } + } + } + } + endRow(accum_m); + } + } + } + } +}; + +template +struct AccumLambdaIteratorSimt { + using Policy = typename T::Policy; + using Iterations = typename T::Iterations; + using Element = typename T::Element; + using Delta = typename T::Delta; + using Shape = typename T::Shape; + static_assert(cutlass::platform::is_same::value, + "only RowMajor is supported"); + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) + { + CUTLASS_PRAGMA_UNROLL + for (int bit = 1; bit < Policy::WarpShape::kColumn; bit *= 2) { + auto otherV = __shfl_xor_sync(0xffffffff, myValue, bit); + myValue = fn(myValue, otherV); + } + return (lane_id & (Policy::WarpShape::kColumn - 1)) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows(cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) + { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int accum_m = mma_m * Delta::kRow + m + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + int accum_n = mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + + lane_offset.column(); + CUTLASS_PRAGMA_UNROLL + for (int n 
= 0; n < Policy::LaneMmaShape::kN; ++n) { + int idx = n + Policy::LaneMmaShape::kN * + (mma_n + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + op(accum_m, accum_n + n, idx); + } + } + endRow(accum_m); + } + } + } + + static cutlass::MatrixCoord CUTLASS_DEVICE + get_lane_offset(int8_t lane_id, int8_t warp_id, typename T::TensorCoord const& tile_offset) + { + static_assert(cutlass::platform::is_same>::value, + ""); + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + cutlass::MatrixCoord lane_offset = + lane_layout.inverse(lane_id) * + cutlass::MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN); + return lane_offset + tile_offset * cutlass::MatrixCoord(Shape::kRow, Shape::kColumn); + } +}; + +template +struct DefaultMmaAccumLambdaIterator; + +// Simt +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaSimtTileIterator, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaSimtTileIterator; + using Iterator = AccumLambdaIteratorSimt; +}; + +// TensorOp - Volta +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator>, + accum_t, + kWarpSize> { + using WarpIterator = typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>; + using Iterator = AccumLambdaIteratorSm70; +}; + +// TensorOp - Sm75+ +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp:: + MmaTensorOpAccumulatorTileIterator, + accum_t, + kWarpSize> { + using WarpIterator = typename cutlass::gemm::warp:: + MmaTensorOpAccumulatorTileIterator; + using Iterator = AccumLambdaIteratorSm80; +}; diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h b/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h new file mode 100644 index 000000000000..40d3265c7a63 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h @@ -0,0 +1,1939 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/functional.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/threadblock/vector_iterator.h" + +#include "../epilogue/epilogue_thread_apply_logsumexp.h" +#include "../gemm/mma_accum_lambda_iterator.h" +#include "../gemm_kernel_utils.h" +#include "../iterators/make_residual_last.h" +#include "../iterators/transpose_warp_iterator.h" +#include "../iterators/warp_iterator_from_smem.h" +#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" +#include "cutlass/gemm/threadblock/mma_pipelined.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +/// Shared storage object needed by accumulator +/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +template +class AccumulatorSharedStorage { +public: + // + // Type definitions + // + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using Padding = Padding_; + + /// Tensor reference to the accumulator + using TensorRefAccum = cutlass::TensorRef; + + /// Shape of the accumulator matrix in shared memory + using ShapeAccum = + cutlass::MatrixShape; + +public: + // + // Data members + // + + /// Buffer for accumulator + cutlass::AlignedBuffer accum; + +public: + // + // Methods + // + + /// Returns a layout object for the Accum matrix + CUTLASS_DEVICE + static Layout LayoutAccum() { return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn}); } + + /// Returns a TensorRef to the Accumulator + CUTLASS_HOST_DEVICE + TensorRefAccum accum_ref() { return TensorRefAccum{accum.data(), LayoutAccum()}; } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the 
matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // Maximum value for K + int kMaxK, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class MmaBaseFromSharedMemory { +public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = + GemmShape; + using WarpCount1 = WarpCount; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); + static int const kWarpGemmIterations1 = kWarpGemmIterations; + + /// Number of stages + static int const kStages = Stages; + + /// If this is true, we fill the entire shmem buffer at start + /// and don't need to iterate through it in a circular fashion + static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape; + + public: + // + // Data members + // + + /// Buffer for B operand + AlignedBuffer operand_B; + + public: + // + // Methods + // + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() + { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { return TensorRefB{operand_B.data(), LayoutB()}; } + }; + +protected: + // + // Data members + // + + // /// Iterator to load a warp-scoped tile of A operand from shared memory + // typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + MmaBaseFromSharedMemory( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage& shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) + { + } +}; + +namespace { + +// has necessary trait compliance with WarpIteratorFromSmem but doesn't do +// anything, can be default initialized, and uses fragment that takes up +// (almost) no space. this warp iterator is selected at compile time when +// elementwise on-the-fly scaling for operand A is disabled, in which case +// operations related to loading scale factors for operand A get wiped out by +// the compiler. 
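// Simplified stand-alone sketch of the trick the NoOpWarpIteratorScale defined
// just below uses (std:: types substituted for the cutlass ones; RealScaleIterator
// and demo() are illustrative names, not part of this file): when scaling is
// disabled, a do-nothing iterator with a 1-element fragment is chosen at compile
// time, so every scale-related load and multiply becomes dead code.
#include <array>
#include <type_traits>

struct RealScaleIterator {                 // stand-in for a real warp iterator
    using Fragment = std::array<float, 8>;
    void load(Fragment& f) const { f.fill(2.0f); }
};

struct NoOpScaleIterator {                 // mirrors NoOpWarpIteratorScale
    using Fragment = std::array<float, 1>; // cannot be zero-sized, so size 1
    void load(Fragment&) const {}          // does nothing
};

template <bool ScaleOperandA>
using ScaleIterator =
    typename std::conditional<ScaleOperandA, RealScaleIterator, NoOpScaleIterator>::type;

template <bool ScaleOperandA>
float demo() {
    ScaleIterator<ScaleOperandA> it;
    typename ScaleIterator<ScaleOperandA>::Fragment scale{};
    it.load(scale);                        // no-op when scaling is disabled
    return scale[0];
}

int main() { return demo<true>() > demo<false>() ? 0 : 1; }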
+template +class NoOpWarpIteratorScale { +public: + // in pipelined+multistage MMA implementations we keep an array of fragments. + // if we aren't using scaling we don't want to waste registers on fragments + // of scale elements, so ideally this would be sized 0. + // Since arrays of zero-sized objects are not allowed, using size as 1. + // The compiler will most likely wipe it out anyways. + using Fragment = cutlass::Array; + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale() {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale(TensorRef const&, int) {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& add_tile_offset(typename TensorRef::TensorCoord const&) { return *this; } + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& operator++() { return *this; } + + CUTLASS_DEVICE + void load(Fragment&) const {} +}; + +// if scaling is enabled, performs fragment elementwise multiplication between +// fragment and its scaling factor. +template +class FragmentElementwiseScaler; + +// specialization for scaling being enabled. +template +class FragmentElementwiseScaler { +public: + // cast scale_frag to correct type then apply elementwise to fragment + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const& scale_frag) + { + Fragment converted_scale_frag = + cutlass::NumericArrayConverter()(scale_frag); + return cutlass::multiplies()(frag, converted_scale_frag); + } +}; + +// specialization for scaling being disabled. doesn't do anything and should +// just get wiped out by the compiler. +template +class FragmentElementwiseScaler { +public: + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const&) { return frag; } +}; +} // namespace + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
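// The *FromSharedMemory mainloops defined below read operand A from the
// accumulator tile of the previous GEMM, which is already resident in shared
// memory, so only operand B is staged from global memory. A rough
// back-of-the-envelope sketch of that shared-memory budget, with assumed tile
// sizes and half-precision elements; the kSmemContainsEntireB condition mirrors
// the one declared in the base class above.
#include <cstdio>

int main() {
    constexpr int kStages = 2;        // double-buffered B
    constexpr int kThreadblockK = 32; // assumed threadblock K tile
    constexpr int kThreadblockN = 64; // assumed threadblock N tile
    constexpr int kMaxK = 64;         // assumed maximum K extent of this GEMM
    constexpr int kBytesPerElement = 2;

    // Entire B extent resident at once -> no circular-buffer wrap-around needed.
    constexpr bool kSmemContainsEntireB = kMaxK <= kThreadblockK * kStages;
    constexpr int bytes_B_stages = kStages * kThreadblockK * kThreadblockN * kBytesPerElement;

    std::printf("B staging buffer: %d bytes, entire B resident: %s\n",
                bytes_B_stages, kSmemContainsEntireB ? "yes" : "no");
    return 0;
}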
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // BEGIN smem + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + // Accumulator type + typename AccumulatorSharedStorage, + // END smem + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter, + /// Used for partial specialization + typename Enable = bool> +class MmaPipelinedFromSharedMemory + : public MmaBaseFromSharedMemory { +public: + ///< Base class + using Base = MmaBaseFromSharedMemory; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + static constexpr bool ScaleOperandA = ScaleOperandA_; + + ///< loads fragments of A_scale from shared memory if operand A scaling is + ///< enabled. otherwise no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA, + NoOpWarpIteratorScale>::type; + + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorB = SmemIteratorB_; + + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages == 2), "MmaPipelined requires kStages set to value 2"); + +private: + using WarpFragmentA = typename Operator::FragmentA; + + /// fragment type of OperandA elementwise scaling matrix. (almost) empty + /// if operand A scaling is disabled. + using WarpFragmentAScale = typename WarpIteratorAScale::Fragment; + + using WarpFragmentB = typename Operator::FragmentB; + + /// applies scaling factor to operand A fragment if operand A scaling is + /// enabled. otherwise no-op. 
+ using FragmentAScaler = + FragmentElementwiseScaler; + +protected: + // /// Iterator to write threadblock-scoped tile of A operand to shared memory + // SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to load a warp-scoped tile of A operand from intermediate + /// accumulator tile + WarpIteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of A_scale from intermediate + /// accumulator tile (only used if ScaleOperandA_ is true) + WarpIteratorAScale warp_tile_iterator_A_scale_; + +public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + // shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& shared_storage, + // warp iterator over A tile held in shared memory + WarpIteratorA warp_iter_a, + // warp iterator over A_scale tile held in shared memory + WarpIteratorAScale warp_iter_a_scale, + int thread_idx, + int warp_idx, + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(warp_iter_a), + warp_tile_iterator_A_scale_(warp_iter_a_scale), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_A_scale_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::SharedStorage& shared_storage, ///< Shared storage needed for internal use + ///< by threadblock-scoped GEMM + AccumulatorSharedStorage& accumulator_shared_storage, + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx, ///< ID of each thread within a warp + int problem_size_0_n) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(accumulator_shared_storage.accum_ref(), lane_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + 
this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + // For API compatibility with MmaMultistageFromSharedMemory + // but not supported as it worsens perf: older gpus < sm80 don't + // support async transfers and have to waste registers + CUTLASS_DEVICE + void set_prologue_done(bool value) {} + CUTLASS_DEVICE + static void prologue(typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) + { + } + + CUTLASS_DEVICE + static void drain_cp_asyncs() {} + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()(int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + // IteratorA iterator_A, ///< iterator over A + // operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + // TransformA transform_A = TransformA(), ///< transformation + // applied to A fragment + TransformB transform_B = TransformB()) + { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentB tb_frag_B; + + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_B.set_residual_tile(gemm_k_iterations == 1); + iterator_B.load(tb_frag_B); + + ++iterator_B; + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_B_; + + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are empty/no-op + // if scaling is disabled. + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentAScale warp_frag_A_scale[2]; + WarpFragmentB warp_frag_B[2]; + warp_frag_A[0].clear(); + warp_frag_A_scale[0].clear(); + warp_frag_B[0].clear(); + + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_A_scale_.load(warp_frag_A_scale[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_B.set_residual_tile(gemm_k_iterations == 2); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tightest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
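// The set_residual_tile() and clear_mask() calls used in the prologue and
// mainloop here implement the K-tail handling. A small sketch of that
// bookkeeping: the iteration count is a ceiling division over the tile width,
// the partially filled ("residual") tile is flagged so its loads are
// predicated, and global loads are masked off once no further iterations
// remain. Sizes below are illustrative assumptions.
#include <cstdio>

int main() {
    constexpr int kThreadblockK = 32;  // assumed threadblock K tile width
    int problem_size_k = 80;           // assumed GEMM K extent

    int gemm_k_iterations = (problem_size_k + kThreadblockK - 1) / kThreadblockK;  // -> 3
    int residual_k = problem_size_k % kThreadblockK;                               // -> 16

    std::printf("iterations=%d, residual tile width=%d\n",
                gemm_k_iterations, residual_k ? residual_k : kThreadblockK);

    // Mirrors the predication pattern: stop issuing global loads for the next
    // tile once the remaining iteration count runs out.
    for (int remaining = gemm_k_iterations; remaining > 0; --remaining) {
        bool clear_mask = (remaining <= 1);
        std::printf("remaining=%d -> clear_mask=%d\n", remaining, clear_mask);
    }
    return 0;
}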
+ bool hasNext = true; + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + // Write fragments to shared memory + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory SMEM: Don't reset iterator A, as + // we are continuing our iteration at this point + if (smem_write_stage_idx == 1) { + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + } + + smem_write_stage_idx ^= 1; + hasNext = gemm_k_iterations > 1; + } + + // Only read the next if we need to + if (hasNext) { + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % + Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_scale_.load(warp_frag_A_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_B.load(tb_frag_B); + + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_B.set_residual_tile(gemm_k_iterations == 3); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + } + + warp_mma(accum, + FragmentAScaler::apply(warp_frag_A[warp_mma_k % 2], + warp_frag_A_scale[warp_mma_k % 2]), + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
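// Condensed sketch of the cp.async staging pattern the multistage mainloop
// defined below is built on, using the CUTLASS helpers from
// cutlass/arch/memory_sm80.h that this file already relies on. The function
// name, tile sizes, and pointer layout here are illustrative assumptions, and
// kAccessBytes must be 4, 8, or 16: each stage is copied global->shared
// asynchronously, a fence closes the stage, and the consumer waits until at
// most (kStages - 2) stages are still in flight before touching shared memory.
#include "cutlass/cutlass.h"
#include "cutlass/arch/memory_sm80.h"

template <int kStages, int kAccessBytes, int kAccessesPerStage>
__device__ void multistage_prologue_sketch(char* smem, char const* gmem, int valid_accesses)
{
    CUTLASS_PRAGMA_UNROLL
    for (int stage = 0; stage < kStages - 1; ++stage) {
        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < kAccessesPerStage; ++i) {
            int idx = stage * kAccessesPerStage + i;
            // Zero-fill the shared-memory destination when the guard is false.
            cutlass::arch::cp_async_zfill<kAccessBytes, cutlass::arch::CacheOperation::Always>(
                smem + idx * kAccessBytes, gmem + idx * kAccessBytes, idx < valid_accesses);
        }
        cutlass::arch::cp_async_fence();  // close this stage's batch of copies
    }
    // Allow at most kStages - 2 stages to remain in flight (the usual choice for
    // a multistage mainloop), then make the arrived data visible to all warps.
    cutlass::arch::cp_async_wait<kStages - 2>();
    __syncthreads();
}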
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA1_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + // Accumulator type + typename AccumulatorSharedStorage, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB1, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages_, + int kMaxK_, + /// Used for partial specialization + typename Enable = bool> +class MmaMultistageFromSharedMemory + : public MmaBaseFromSharedMemory { +public: + ///< Base class + using Base = MmaBaseFromSharedMemory; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape1 = Shape1_; + ///< Iterates over tiles of B operand in global memory + using IteratorB1 = IteratorB1_; + using IteratorB = IteratorB1; + ///< Policy describing tuning details + using Policy1 = Policy1_; + + using SmemIteratorB1 = SmemIteratorB1_; + using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate + ///< accumulator tile in shared memory + static constexpr bool ScaleOperandA = ScaleOperandA_; + + ///< warp level iterator over A_scale matrix tile kept in shared memory. + ///< if elementwise A scaling is disabled then everything this does is no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA1, + NoOpWarpIteratorScale>::type; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1; + static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + using FragmentC = FragmentC1; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on B operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + static_assert(Base::kWarpGemmIterations1 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand B + static int const TBLoadIterationsB1 = IteratorB1::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB1 = + (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; + }; + + static constexpr int kNumStagesConcurrentLoad = kSmemContainsEntireB ? 
Base::kStages + : Base::kStages - 1; + +private: + using WarpLoadedFragmentA1 = typename Operator1::FragmentA; + /// fragment of OperandA scale matrix. if operand A scaling is disabled this + /// is (almost) empty. + using WarpLoadedFragmentA1Scale = typename WarpIteratorAScale::Fragment; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + /// applies elementwise scaling to fragment of A. if operand A scaling is + /// disabled this is a no-op. + using FragmentAScaler = + FragmentElementwiseScaler; + +private: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate + /// accumulator tile + WarpIteratorA1 warp_tile_iterator_A1_; + + /// Iterator to load a warp-scoped tile of A1_scale operand from shared memory + /// if operand A scaling is disabled everything this does is a no-op. + WarpIteratorAScale warp_tile_iterator_A1_scale_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + + bool prologue_done_; + +public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + // shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& shared_storage, + // warp level iterator over operand A tile kept in shared memory + WarpIteratorA1 warp_tile_iterator_A1, + // warp level iterator over operand A elementwise scale tile kept in + // shared memory. + WarpIteratorAScale warp_tile_iterator_A1_scale, + int thread_idx, + int warp_idx, + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(warp_tile_iterator_A1), + warp_tile_iterator_A1_scale_(warp_tile_iterator_A1_scale), + smem_iterator_B1_(shared_storage.operand_B_ref(), thread_idx), + prologue_done_(false) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + warp_tile_iterator_A1_scale_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::SharedStorage& shared_storage, ///< Shared storage needed for internal use + ///< by threadblock-scoped GEMM + AccumulatorSharedStorage& accumulator_shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx, + ///< GEMM0 N is used for accumulator extent + int problem_size_0_n) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + 
warp_tile_iterator_A1_(accumulator_shared_storage.accum_ref(), lane_idx), + smem_iterator_B1_(shared_storage.operand_B_ref(), thread_idx), + prologue_done_(false) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + CUTLASS_DEVICE + void set_prologue_done(bool value) { prologue_done_ = value; } + + CUTLASS_DEVICE + static void prologue(typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) + { + SmemIteratorB1 smem_iterator_B1(shared_storage.operand_B_ref(), thread_idx); + _prologue(iterator_B1, + (problem_size_0_n + Base::Shape::kK - 1) / Base::Shape::kK, + smem_iterator_B1); + } + + CUTLASS_DEVICE + static void drain_cp_asyncs() + { + // commit and drain all pending and predicated cp.async pnz from the GEMM + // mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_1(IteratorB1& iterator_B1, int group_start_B1 = 0) + { + iterator_B1.set_iteration_index(group_start_B1 * IteratorB1::kAccessesPerVector); + this->smem_iterator_B1_.set_iteration_index(group_start_B1); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { + if (group_start_B1 + j < Detail::TBLoadIterationsB1) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B1.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B1.valid()); + + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + } + } + + CUTLASS_DEVICE + static void _prologue(IteratorB& iterator_B1, + int32_t gemm_k_iterations_1, + SmemIteratorB1& smem_iterator_B1_) + { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; ++stage, --gemm_k_iterations_1) { + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + + iterator_B1.set_iteration_index(0); + smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast(smem_iterator_B1_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + 
cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + } + + ++smem_iterator_B1_; + } + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + smem_iterator_B1_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations_1_, + ///< destination accumulator tile + FragmentC1& accum, + ///< iterator over B1 operand in global memory + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC1 const& src_accum) + { + // 2nd Gemm + + // + // Prologue + // + // Perform accumulation in the 'd' output operand + accum = src_accum; + + if (!prologue_done_) { + _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_); + } else if (!kSmemContainsEntireB) { + // Restore the iterators increments + + int gemm_k_iterations_1 = gemm_k_iterations_1_; + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; ++stage, --gemm_k_iterations_1) { + iterator_B1.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { ++iterator_B1; } + ++this->smem_iterator_B1_; + } + iterator_B1.add_tile_offset({1, 0}); + this->smem_iterator_B1_.add_tile_offset({1, 0}); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1); + iterator_B1.clear_mask(gemm_k_iterations_1 <= 0); + } + + // DEPBAR+SYNC + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are no-op/empty + // if scaling is disabled. + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA1 warp_loaded_frag_A1[2]; + WarpLoadedFragmentA1Scale warp_loaded_frag_A1_scale[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA1 warp_transformed_frag_A1[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator1 warp_mma1; + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]); + ++warp_tile_iterator_A1_; + + warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]); + ++warp_tile_iterator_A1_scale_; + + this->warp_tile_iterator_B_.set_kgroup_index(0); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]); + ++this->warp_tile_iterator_B_; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma1.transform( + warp_transformed_frag_A1[0], + warp_transformed_frag_B1[0], + FragmentAScaler::apply(warp_loaded_frag_A1[0], warp_loaded_frag_A1_scale[0]), + warp_loaded_frag_B1[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. 
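+        // In other words, `tmp_accum` below collects the warp-level products of one
+        // mainloop iteration and `plus_accum` folds it into `accum` when warp_mma_k
+        // wraps around; this staged add is meant to preserve the extra precision of
+        // the fast-FP32 (tf32x3) paths selected just below, while the ordinary
+        // multiply-add path accumulates into `accum` directly.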
+ plus plus_accum; + + FragmentC1 tmp_accum; + + if (platform::is_same::value || + platform::is_same::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_PRAGMA_UNROLL + for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1); + gemm_k_iterations_1 > (-Base::kStages + 1); + gemm_k_iterations_1--) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) { + // Load warp-level tile from accumulator fragment (A) + // or shared memory (operand B) + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % + Base::kWarpGemmIterations1); + // skip warp tile loading for the last kgroup (we are out of the buf) + if (gemm_k_iterations_1 > (-Base::kStages + 2) || + warp_mma_k < Base::kWarpGemmIterations1 - 1) { + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]); + warp_tile_iterator_A1_scale_.load( + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + ++warp_tile_iterator_A1_; + ++warp_tile_iterator_A1_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma1.transform( + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + FragmentAScaler::apply(warp_loaded_frag_A1[warp_mma_k % 2], + warp_loaded_frag_A1_scale[warp_mma_k % 2]), + warp_loaded_frag_B1[warp_mma_k % 2]); + + if (platform::is_same::value || + platform::is_same::value) { + warp_mma1(tmp_accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma1(accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (warp_mma_k < Base::kWarpGemmIterations1 - 1) { + int group_start_iteration_B1; + + group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations1) { + int group_start_iteration_B1; + group_start_iteration_B1 = (warp_mma_k + 1) * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (!kSmemContainsEntireB) { + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + } + + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2); + iterator_B1.clear_mask(gemm_k_iterations_1 == 1); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) + warp_mma1.transform( + warp_transformed_frag_A1[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + FragmentAScaler::apply(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]), + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same::value || + platform::is_same::value) { + accum = plus_accum(accum, tmp_accum); + } + } +}; + +template +struct DefaultWarpIteratorAFromSharedMemory {}; + +// TensorOp - Ampere half +template +struct DefaultWarpIteratorAFromSharedMemory< + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + RegularWarpIterator, + Policy, + typename platform::enable_if<(sizeof_bits::value == 16 && + Policy::Operator::Policy::OpDelta::kRow == 1)>::type> { + static constexpr auto kWarpSize = 32; + using OpDelta = typename Policy::Operator::Policy::OpDelta; + using WarpShape = cutlass::MatrixShape<32, 32>; + + using WarpIterator = + cutlass::gemm::warp::WarpIteratorFromSmem; +}; + +// TensorOp - Ampere f32 +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<16, 8, 8>, + RegularWarpIterator, + Policy, + typename platform::enable_if<(sizeof_bits::value != 16 || + Policy::Operator::Policy::OpDelta::kRow != 1)>::type> { + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + static constexpr auto kWarpSize = 32; + using OpDelta = typename Policy::Operator::Policy::OpDelta; + + using WarpIterator = cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + cutlass::MatrixShape, + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::layout::RowMajor, + cutlass::MatrixShape, + OpDelta::kRow, + kWarpSize>; +}; + +// TensorOp - Volta +template +struct DefaultWarpIteratorAFromSharedMemory, + RegularWarpIterator, + Policy> { + using InstructionShape = cutlass::gemm::GemmShape<16, 16, 4>; + static constexpr auto kWarpSize = 32; + using OpDelta = typename Policy::Operator::Policy::OpDelta; + + using WarpIterator = cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< + cutlass::MatrixShape<32, 32>, // MatrixShape, + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>, + cutlass::MatrixShape<16, 4>, + OpDelta::kRow, + kWarpSize>; +}; + +// Simt +template +struct DefaultWarpIteratorAFromSharedMemory, + RegularWarpIterator, + Policy> { + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + static constexpr auto kWarpSize = 32; + + // We just use the same iterator, as we reproduced the same shared-memory + // schema. Just modify it to handle non-complete tiles. + using WarpIterator = RegularWarpIterator; +}; + +// Converts a "regular" Mma into their counterpart from shared memory +template +struct DefaultMmaFromSharedMemory; + +// Mma pipelined +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_, + /// Transformation applied to B operand + typename TransformB_, + typename AccumulatorSharedStorage_, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory, + AccumulatorSharedStorage_, + kScaleOperandA, + kTransposeA> { + static constexpr int kWarpSize = 32; + using SmemAccumulatorLayout = cutlass::layout::RowMajor; + + using RegularMma = MmaPipelined; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using ArchMmaOperator = typename Policy_::Operator; + + static constexpr bool kIsTransposedA = false; + using WarpIteratorA = + typename DefaultWarpIteratorAFromSharedMemory::WarpIterator; + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast::Iterator; + + using Mma = + typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory; +}; + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or 
predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + typename AccumulatorSharedStorage_, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory, + AccumulatorSharedStorage_, + kScaleOperandA, + kTransposeA> { + static constexpr int kWarpSize = 32; + + using RegularMma = MmaMultistage; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using WarpIteratorA_ = + typename DefaultWarpIteratorAFromSharedMemory::WarpIterator; + using WarpIteratorTranspose = TransposeWarpIterator; + static constexpr bool kIsTransposedA = WarpIteratorTranspose::kSupportsTranspose && kTransposeA; + using WarpIteratorA = typename platform:: + conditional::type; + + static int constexpr kMaxK = kIsTransposedA ? AccumulatorSharedStorage_::Shape::kM + : AccumulatorSharedStorage_::Shape::kN; + // Reduce the number of stages if we don't need that many + static int constexpr kStagesMax = (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK); + static int constexpr kStages = cutlass::const_min(Stages, kStagesMax); + + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast::Iterator; + using Mma = typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory< + Shape_, + WarpIteratorA, + kScaleOperandA, + AccumulatorSharedStorage_, + IteratorB, + SmemIteratorB_, + RegularMma::kCacheOpB, + ElementC_, + LayoutC_, + Policy_, + kStages, + kMaxK>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct B2bGemm; + +// Tensor Cores >= Sm75 specialization (Ampere ...) +template < /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp:: + MmaTensorOpAccumulatorTileIterator, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = typename cutlass::gemm::warp:: + MmaTensorOpAccumulatorTileIterator; + using FragmentC = typename IteratorC::Fragment; + using InstructionShape = InstructionShape_; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using accum_t = Element_; + using lse_scalar_t = float; + + using SmemAccumulatorLayout = cutlass::layout::RowMajor; + + // Iterator to load accumulators (results of matmul in registers) + using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp< + WarpShape, + InstructionShape, + accum_t, + typename Operator::Policy::Operator::FragmentC, + cutlass::layout::RowMajor>; + + // Iterator to store to shared-memory + using SmemIteratorD0 = + typename cutlass::epilogue::warp::TileIteratorTensorOp; + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage; + // We need to provide an operation for the epilogue. 
Let's create an + // operation that does nothing (ScaleType::Nothing), just converts + // from accum_t (float) -> scalar_t (can be half) + using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination< + typename SmemIteratorD0::Element, // ElementOutput + FragmentIteratorAccumulator::Fragment::kElements, + accum_t, // ElementAccumulator + typename SmemIteratorD0::Element, // ElementCompute + cutlass::epilogue::thread::ScaleType::Nothing>; + using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + SmemIteratorD0, // ScaleBiasIterator + // - not used + OutputOpNoOp>; + + // Epilogue 2: with LSE (for backwards pass) + static int const kElementsPerAccess = 2; // TODO: Why 2? + using IteratorAccumulatorLSE = cutlass::transform::threadblock::VectorIterator< + cutlass::transform::threadblock::PredicatedVectorAccessIterator< + // Shape + cutlass::MatrixShape, + // WarpShape + cutlass::MatrixShape, + lse_scalar_t, + cutlass::layout::RowMajor, + kElementsPerAccess>>; + using EpilogueOpApplyLSE = cutlass::epilogue::thread::ApplyLogSumExp< + scalar_t, // ElementOutput_ + lse_scalar_t, // ElementLSE_ + accum_t, // ElementAccumulator_ + accum_t, // ElementCompute_ + 128 / cutlass::sizeof_bits::value + // FragmentIteratorAccumulator::Fragment::kElements + // InstructionShape::kM * InstructionShape::kN / 32 + >; + using EpilogueWithLSE = + cutlass::epilogue::threadblock::EpilogueSmemAccumulator; + + static void CUTLASS_DEVICE accumToSmem(AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) + { + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * cutlass::MatrixCoord{SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + Epilogue epilogue; + epilogue(OutputOpNoOp({}), smem_iterator_attn, accum); + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem(AccumulatorSharedStorage& shared_storage, + FragmentC& accum, + lse_scalar_t const* lse, + int32_t lse_extents, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) + { + constexpr int32_t kAlignLSE = 32; + IteratorAccumulatorLSE iterator_lse( + lse, + {(int32_t)0, (int32_t)ceil_div(lse_extents, kAlignLSE) * kAlignLSE}, + thread_id, + warp_id, + cutlass::MatrixCoord{0, 0} // offset + ); + + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * cutlass::MatrixCoord{SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + EpilogueWithLSE epilogue; + EpilogueOpApplyLSE minus_lse_exp({}); + epilogue(minus_lse_exp, + smem_iterator_attn, + accum, + // scale - unused + iterator_lse, + // bias + iterator_lse); + } +}; + +// Volta Specialization +// only supported for f16 +template +struct B2bGemm, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>, + Operator, + cutlass::half_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>; + using scalar_t = cutlass::half_t; + using accum_t = IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = IteratorC::Fragment; + using 
lse_scalar_t = float; + + using SmemAccumulatorLayout = cutlass::layout::RowMajor; + using SmemIteratorD0 = + cutlass::epilogue::warp::TileIteratorVoltaTensorOp, + scalar_t, + SmemAccumulatorLayout>; + + // // Storage in shared-memory for Q.Kt + using AccumulatorSharedStorage = cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise< + 16, + 32>, // typename SmemIteratorD0::TensorLayout, + cutlass::MatrixShape<0, 0> // Padding + >; + + using OutputLayout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>; + using TensorRef = cutlass::TensorRef; + using Policy = typename IteratorC::Policy; + using Element = accum_t; + // Those are MmaVoltaTensorOpAccumulatorTileIterator private fields + // Let's copy their values + static int const kElementsPerPartial = 4; + using EleShapePerPatial = + typename cutlass::platform::conditional::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static void CUTLASS_DEVICE accumToSmem(AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) + { + // ctor - from MmaVoltaTensorOpAccumulatorTileIterator + TensorRef ref_(shared_storage.accum_ref()); + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + cutlass::MatrixCoord lane_offset(accum_m, accum_n); + + // Tile offset + ref_.add_coord_offset(tile_coords * cutlass::MatrixCoord({IteratorC::Shape::kRow, + IteratorC::Shape::kColumn})); + + using AccessType = cutlass::Array; + + // store - from MmaVoltaTensorOpAccumulatorTileIterator + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int mma_accum_start = (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2; + int r = (accum_m + lane_offset.row()); + AccessType to_store; + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + int c = (accum_n + n + 
lane_offset.column()); + to_store[n] = scalar_t(accum[idx]); + } + int c = (accum_n + lane_offset.column()); + assert(r < 32); + assert(c < 32); + *reinterpret_cast(ref_.data() + ref_.offset({r, c})) = + to_store; + } + } + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem(AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) + { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator::Iterator; + auto lane_offset = AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +// Simt Specialization +// for f32 on Sm70-Sm75 and f16/f32 below + +template +struct B2bGemm, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>; + using accum_t = typename IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = typename IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage // Padding + >; + + static void CUTLASS_DEVICE accumToSmem(AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) + { + using Policy = typename IteratorC::Policy; + using Element = typename IteratorC::Element; + using Iterations = typename IteratorC::Iterations; + using Delta = typename IteratorC::Delta; + + auto ref_ = shared_storage.accum_ref(); + // ctor - MmaSimtTileIterator + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN); + + ref_.add_coord_offset(lane_offset); + + // Tile offset + ref_.add_coord_offset(tile_coords * cutlass::MatrixCoord({IteratorC::Shape::kRow, + IteratorC::Shape::kColumn})); + + // store - MmaSimtTileIterator + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int r = Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) + m; + int c = mma_n * Delta::kColumn + n; + int idx = n + Policy::LaneMmaShape::kN * + (mma_n + 
Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + ref_.at({r, c}) = scalar_t(accum[idx]); + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem(AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) + { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator::Iterator; + auto lane_offset = AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h new file mode 100644 index 000000000000..2a4300c5cac1 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h @@ -0,0 +1,254 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include "cutlass/arch/mma.h" + +template +struct CheckArch { + static constexpr bool isPreVolta = arch::kMinComputeCapability < 70; + static constexpr bool isPreAmpere = + arch::kMinComputeCapability < 80 && arch::kMinComputeCapability >= 70; + static constexpr bool isAmpere = arch::kMinComputeCapability >= 80; +#if defined(__CUDA_ARCH__) + static constexpr bool compiler_cc = arch::kMinComputeCapability * 10 <= __CUDA_ARCH__; +#else + static constexpr bool compiler_cc = true; +#endif + static constexpr bool value = (isPreVolta && std::is_same_v) || + (isPreAmpere && !std::is_same_v) || + isAmpere && compiler_cc; +}; + +#define DISPATCH_ARCHTAG(CC, func) \ + { \ + if constexpr (GPU_ARCH >= 80) { \ + if (CC >= 80) { \ + using ArchTag = cutlass::arch::Sm80; \ + func; \ + } else { \ + EVOFORMER_CHECK(false, "Compile flag error. Unexpected GPU"); \ + } \ + } else if constexpr (GPU_ARCH >= 75) { \ + if (CC >= 75) { \ + using ArchTag = cutlass::arch::Sm75; \ + func; \ + } else { \ + EVOFORMER_CHECK(false, "Compile flag error. Unexpected GPU"); \ + } \ + } else if constexpr (GPU_ARCH >= 70) { \ + if (CC >= 70) { \ + using ArchTag = cutlass::arch::Sm70; \ + func; \ + } else { \ + EVOFORMER_CHECK(false, "Compile flag error. Unexpected GPU"); \ + } \ + } else { \ + EVOFORMER_CHECK(false, "Only GPUs with Tensor Core are supported for now"); \ + } \ + } + +#define DISPATCH_TYPES(tensor, func) \ + { \ + if (tensor.scalar_type() == at::ScalarType::Half) { \ + using scalar_t = cutlass::half_t; \ + using torch_scalar_t = at::Half; \ + func; \ + } else if (tensor.scalar_type() == at::ScalarType::BFloat16) { \ + using scalar_t = cutlass::bfloat16_t; \ + using torch_scalar_t = at::BFloat16; \ + func; \ + } else { \ + EVOFORMER_CHECK(false, "Only fp16 and bf16 supported at the moment"); \ + } \ + } + +#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \ + { \ + if (BOOL_V) { \ + constexpr bool BOOL_NAME = true; \ + F(); \ + } else { \ + constexpr bool BOOL_NAME = false; \ + F(); \ + } \ + } + +#ifdef TORCH_CHECK +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + EVOFORMER_CHECK(uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") +#define EVOFORMER_CHECK TORCH_CHECK +#elif defined(__CUDACC_RTC__) +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + if (!(uint64_t(PTR) % ALIGNMENT == 0)) { return false; } +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { return false; } +#else +#include +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + if (!(uint64_t(PTR) % ALIGNMENT == 0)) { \ + std::cerr << #PTR " is not correctly aligned\n"; \ + return false; \ + } +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "[Evoformer Attention]" \ + << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ + } +#endif + +namespace gemm_kernel_utils { + +template +constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) +{ + return (n + m - 1) / m; +} + +template +constexpr CUTLASS_HOST_DEVICE integer align_up(integer n, integer m) +{ + return ((n + m - 1) / m) * m; +} + +//////////////////////////////////////////////////////////////////////////////// +// Determine the type of GEMM we do (TensorCores or not, Shapes ...) 
+// TODO: Maybe we could rely on Cutlass's DefaultGemm templates +//////////////////////////////////////////////////////////////////////////////// + +// Fallback to Simt (FMA on cuda cores) if not in a special case below +template +struct DefaultGemmType { + static constexpr int ThreadK = 8; + static constexpr int WarpK = 8; + static constexpr int kMinimumAlignment = 1; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using OpClass = cutlass::arch::OpClassSimt; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f32 +template +struct DefaultGemmType< + ArchTag, + float, + typename cutlass::platform::enable_if= 80>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAddFastF32; +}; + +// Specialization for tensorcores with f16/bf16 - Sm75+ +template +struct DefaultGemmType< + ArchTag, + scalar_t, + typename cutlass::platform::enable_if= 75 && + cutlass::sizeof_bits::value == 16>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f16 - Volta +template <> +struct DefaultGemmType { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 2; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Enables to do +// `auto x = kCondition ? fa(arg) : fb(arg)` +// when `fa` and `fb` have different types +template +struct call_conditional; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) -> decltype(ta(arg)) + { + return ta(arg); + } +}; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) -> decltype(tb(arg)) + { + return tb(arg); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Mark a variable as warp-uniform - enables some compiler optimizations +// The cheapest way to do it is just to broadcast it from lane 0 +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_DEVICE int32_t warp_uniform(int32_t value) +{ + return (int32_t)__shfl_sync(0xffffffff, (unsigned)value, 0); +} + +template +CUTLASS_DEVICE T* warp_uniform(T* ptr) +{ + struct { + union { + T* ptr; + uint32_t asInt[2]; + }; + } p; + p.ptr = ptr; + p.asInt[0] = warp_uniform(p.asInt[0]); + p.asInt[1] = warp_uniform(p.asInt[1]); + return p.ptr; +} +} // namespace gemm_kernel_utils diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/epilogue_predicated_tile_iterator.h b/csrc/deepspeed4science/evoformer_attn/iterators/epilogue_predicated_tile_iterator.h new file mode 100644 index 000000000000..667f1982d30d --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/iterators/epilogue_predicated_tile_iterator.h @@ -0,0 +1,691 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Epilogue iterator that supports prefetching + + Mostly copied from "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/threadblock/output_tile_thread_map.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/transform/pitch_linear_thread_map.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////// + +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load and store output tile from global memory in +/// epilogue. 
+/// +/// Satisfies: ReadableTileIterator | PredicatedTileIterator | +/// ForwardTileIterator +/// +template +class PredicatedTileIteratorPrefetch { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Count::kTile; + + static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); + static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); + static_assert(ThreadMap::Iterations::kCluster > 0, + "ThreadMap::Iterations::kCluster must be > 0"); + static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { + using Base = PredicatedTileIteratorParams; + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc()) + { + } + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + + /// Mask object + struct Mask { + static int const kCount = ThreadMap::Iterations::kColumn; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { enable(); } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { predicates[i] = false; } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { predicates[i] = true; } + } + }; + +private: + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. 
+ PredicatedTileIteratorParams params_; + + /// Byte-level pointer + uint8_t* byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in rows + Index extent_column_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column + Index thread_start_column_; + + /// Internal state counter + int state_[3]; + + /// Scatter indices + int const* indices_; + + // + // Static asserts about internal strides + // + + static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides"); + +private: + // + // Methods + // + +public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + PredicatedTileIteratorPrefetch(PredicatedTileIteratorParams const& params, + Element* pointer, + TensorCoord extent, + int thread_idx, + TensorCoord threadblock_offset = TensorCoord(), + int const* indices = nullptr) + : params_(params), indices_(indices) + { + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_row_ = extent.row(); + extent_column_ = extent.column(); + + thread_start_row_ = thread_offset.row(); + thread_start_column_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { + mask_.predicates[c] = + ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < extent.column()); + } + + // Null pointer performs no accesses + if (!pointer) { mask_.clear(); } + + if (ScatterD && !indices) { mask_.clear(); } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.row()) * LongIndex(params_.stride) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; + + if (ScatterD) { + byte_pointer_ = + reinterpret_cast(pointer) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; + } + + // Initialize internal state counter + state_[0] = state_[1] = state_[2] = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_DEVICE + void prefetch_all() + { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kIterations; ++iter) { + prefetch(); + ++(*this); + } + } + + CUTLASS_DEVICE + void prefetch() + { + uint8_t* byte_pointer = byte_pointer_; + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + AccessType* memory_pointer = reinterpret_cast(byte_pointer); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + // on windows using unsigned long here gives the error + // error: asm operand type size(4) does not match + // type/size implied by constraint 'l' + uint64_t addr = + (uint64_t)((void*)&memory_pointer[column * ThreadMap::Delta::kColumn / + 
kElementsPerAccess]); + asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { byte_pointer += params_.increment_row; } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { byte_pointer += params_.increment_row; } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) const { load_with_byte_offset(frag, 0); } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < 
ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + if (UseCUDAStore) { + if (guard) { + memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess] = + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + + column]; + } + } else { + cutlass::arch::global_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); + } + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { byte_pointer += params_.increment_row; } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) const { store_with_byte_offset(frag, 0); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void downsample_load_with_byte_offset(Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + + int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + + (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + + add_Q; + + int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + byte_pointer += params_.increment_row; + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void upsample_load_with_byte_offset(Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) 
{ + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + int row_add_P = add_P; + int row_add_Q = add_Q; + if (output_P > convolution_P - 2) row_add_P = 0; + if (output_Q > convolution_Q - 2) row_add_Q = 0; + + int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + + ((output_P + row_add_P) / 2) * (convolution_Q / 2) + + (output_Q + row_add_Q) / 2; + + int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + byte_pointer += params_.increment_row; + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + CUTLASS_DEVICE + MatrixCoord thread_start() const + { + return MatrixCoord(thread_start_row_, thread_start_column_); + } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_row() const { return thread_start_row_; } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_column() const { return thread_start_column_; } + + /// Extent of the matrix in rows + CUTLASS_DEVICE + Index extent_row() const { return extent_row_; } + + /// Extent of the matrix in columns + CUTLASS_DEVICE + Index extent_column() const { return extent_column_; } + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + PredicatedTileIteratorPrefetch& operator++() + { + ++state_[0]; + + if (!ScatterD) { byte_pointer_ += params_.advance_row; } + + thread_start_row_ += ThreadMap::Shape::kRow; + + if (state_[0] == ThreadMap::Count::kRow) { + state_[0] = 0; + ++state_[1]; + byte_pointer_ += params_.advance_group; + + thread_start_row_ += + (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; + + if (state_[1] == ThreadMap::Count::kGroup) { + state_[1] = 0; + ++state_[2]; + byte_pointer_ += params_.advance_cluster; + + thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * + ThreadMap::Count::kRow * ThreadMap::Shape::kRow; + + if (state_[2] == ThreadMap::Count::kCluster) { + state_[2] = 0; + byte_pointer_ += params_.advance_tile; + } + } + } + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { mask_.clear(); } + + 
///< Efficiently enables all accesses guarded by mask
+    CUTLASS_DEVICE void enable_mask() { mask_.enable(); }
+
+    ///< Gets the mask
+    CUTLASS_DEVICE void get_mask(Mask& mask) const { mask = mask_; }
+
+    ///< Sets the mask
+    CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; }
+};
+
+template <typename IT>
+struct MakePrefetchableIterator {
+    using Iterator = PredicatedTileIteratorPrefetch<typename IT::ThreadMap, typename IT::Element>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/make_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/make_residual_last.h
new file mode 100644
index 000000000000..ff0e324c3a6c
--- /dev/null
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/make_residual_last.h
@@ -0,0 +1,91 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include "predicated_tile_access_iterator_residual_last.h" +#include "predicated_tile_iterator_residual_last.h" + +namespace cutlass { +namespace transform { +namespace threadblock { + +template +struct MakeIteratorResidualLast; + +template +struct MakeIteratorResidualLast< + PredicatedTileIterator> { + using Iterator = PredicatedTileIteratorResidualLast; +}; + +template +struct MakeIteratorResidualLast> { + using Iterator = PredicatedTileAccessIteratorResidualLast; +}; +} // namespace threadblock +} // namespace transform +} // namespace cutlass diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h new file mode 100644 index 000000000000..7f6a2430845a --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h @@ -0,0 +1,1964 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Templates calculating the address and predicates to the load of tiles + from pitch-linear rank=2 tensors. + + This iterator uses masks to guard out-of-bounds accesses. The first tile + this iterator visits maybe partial, then the remaining tiles are complete. + So, we only need to compute the predicates twice, once before the first tile + and once for the remaining full tiles which can share the same predicates. 
+ + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h" + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedTileAccessIteratorResidualLast +/// +template +class PredicatedTileAccessIteratorResidualLast; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for pitch-linear +/// data. +/// +template +class PredicatedTileAccessIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::PitchLinear; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + using Mask = typename UnderlyingPredicates::Mask; + + /// Uses a non-template class + struct Params : PredicatedTileAccessIteratorParams { + using Base = PredicatedTileAccessIteratorParams; + + // Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : Base(layout.stride(0), + MakePredicatedTileAccessIteratorDesc()()) + { + } + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + +private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + +private: + // + // Data members + // + + UnderlyingPredicates the_predicates; + Mask residual_tile_mask; + + /// Parameters object with precomputed internal state + Params params_; + + /// Internal pointer to first access of tile + BytePointer pointer_; + + /// Below is used when Gather is turned on. We need to record strided_offset + /// and contiguous_offset separated to compute the offset by using + /// + /// offset = contiguous_offset + indices[strided_offset] + /// + + /// Gather indices + int const* indices_; + + Index gather_offset_strided; + +private: + /// Computes predicates based on internally tracked per-thread offset. 
+ CUTLASS_DEVICE + void compute_predicates_( + /// Extent of the matrix window + TensorCoord extent, + /// optionally, simplify predicate calculation during 'steady state' phase + bool is_steady_state = false) + { + the_predicates.compute_predicates_(extent, is_steady_state); + } + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : params_(params), + pointer_(reinterpret_cast(const_cast(pointer))), + the_predicates(extent), + indices_(indices) + { + the_predicates.set_predicates(thread_id, threadblock_offset); + the_predicates.get_mask(residual_tile_mask); + + // Working around a weird compiler bug happening on P100 for the backward. + // I've seen together: the_predicates.predicates_[0] = 14 (instead of 15) + // residual_tile_mask[0] = 15 (correct) + // + // Adding prints when the value is calculated (in `compute_predicates_`) + // sometimes removes the bug. The consequence is that we skip some + // element of a tensor, leading to wrong results + // Setting `compute_predicates_`'s second argument (`is_steady_state`) to + // true also seems to get rid of the bug - at the cost of twice as many + // comparisons. +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + constexpr bool kWorkAroundCompilerBug = false; +#else + constexpr bool kWorkAroundCompilerBug = true; +#endif + the_predicates.compute_predicates_(extent, true && !kWorkAroundCompilerBug); + + // update internal pointers + Layout layout(params_.stride_); + + if (!Gather) { + add_pointer_offset(layout(the_predicates.thread_offset_)); + } else { + gather_offset_strided = the_predicates.thread_offset_.strided(); + add_pointer_offset(layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0))); + } + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { the_predicates.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool is_residual_tile) + { + if (is_residual_tile) { the_predicates.set_mask(residual_tile_mask); } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + pointer_ += sizeof_bits::value * pointer_offset / 8; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + if (!Gather) { + if (kAdvanceRank) { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided()); + pointer_ += Shape::kContiguous * tile_offset.contiguous(); + } else { + pointer_ += params_.inc_advance_ * 
LongIndex(tile_offset.contiguous()); + pointer_ += Shape::kStrided * tile_offset.strided(); + } + } else { + add_pointer_offset(Shape::kContiguous * tile_offset.contiguous()); + gather_offset_strided += Shape::kStrided * tile_offset.strided(); + } + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const + { + if (Gather) { + assert(indices_); + + if (!valid()) { return nullptr; } + + LongIndex contiguous_offset = + the_predicates.iteration_contiguous_ * + (ThreadMap::Delta::kContiguous * sizeof_bits::value / 8) + + the_predicates.iteration_vector_; + int strided_index = gather_offset_strided + + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided; + + LongIndex strided_offset = indices_[strided_index] * LongIndex(params_.stride_) * + sizeof_bits::value / 8; + + return reinterpret_cast(pointer_ + contiguous_offset + strided_offset); + } + + return reinterpret_cast( + pointer_ + the_predicates.iteration_contiguous_ * + (ThreadMap::Delta::kContiguous * sizeof_bits::value) / + 8) + + the_predicates.iteration_vector_; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + the_predicates.operator++(); + + ++the_predicates.iteration_vector_; + if (the_predicates.iteration_vector_ < kAccessesPerVector) { return *this; } + + the_predicates.iteration_vector_ = 0; + ++the_predicates.iteration_contiguous_; + + if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + the_predicates.iteration_contiguous_ = 0; + ++the_predicates.iteration_strided_; + + if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) { + if (!Gather) { pointer_ += params_.inc_strided_; } + + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + the_predicates.iteration_strided_ = 0; + + if (!Gather) { + // advance to next tile + pointer_ += params_.inc_next_; + + // now return to start tile - if the iterator is subsequently advanced, + // this subtraction as well as the subsequent integer addition are both + // elided by the compiler. + pointer_ -= params_.inc_advance_; + } + + return *this; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { the_predicates.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { the_predicates.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { the_predicates.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { the_predicates.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() const { return the_predicates.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major +/// data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileAccessIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessType, + Gather>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){}; + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column()), + indices) + { + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + 
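The column-major and row-major specializations in this file only remap coordinates onto the pitch-linear iterator above; what actually distinguishes the ResidualLast family is that two predicate masks are prepared up front, one captured for the partial (residual) tile and one shared by every full tile, and `set_residual_tile()` swaps the residual mask in when the caller reaches that tile, so predicates are not recomputed per tile. A hypothetical caller-side sketch of that protocol follows; the loop shape, names, and the tile-level `load()` method (from the companion PredicatedTileIteratorResidualLast) are assumptions for illustration, not code from this PR.

```cuda
#include <cutlass/cutlass.h>

// Sketch only: walks a tile-level ResidualLast iterator over k_tiles tiles.
// 'Iterator' must provide load(), operator++ and set_residual_tile().
template <typename Iterator, typename Fragment>
CUTLASS_DEVICE void load_all_tiles(Iterator iterator, Fragment* frags, int k_tiles)
{
    for (int k = 0; k < k_tiles; ++k) {
        // The possibly-partial tile is visited last (hence "ResidualLast").
        // Only for that iteration is the residual mask captured at construction
        // swapped in; earlier tiles keep the cheaper steady-state predicates.
        iterator.set_residual_tile(k == k_tiles - 1);
        iterator.load(frags[k]);  // predicated load; out-of-bounds lanes are masked off
        ++iterator;               // advance to the next tile
    }
}
```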
+ CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { return reinterpret_cast(iterator_.get()); } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return iterator_.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major +/// data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileAccessIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessType, + Gather>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){}; + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row()), + indices) + { + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { return reinterpret_cast(iterator_.get()); } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return iterator_.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2 +/// data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileAccessIteratorResidualLast, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRankN<2>; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingPredicates::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + friend PredicatedTileAccessIteratorResidualLast; + + private: + /// stride of pitch-linear layout (units of Element) + Coord stride_; + /// amount (in byte) to increment pointer to move to next access along + /// contiguous dimension + LongIndex inc_contiguous_; + /// amount (in byte) to increment pointer from first access of current + /// contiguous dimension to first access of next one. + LongIndex inc_strided_; + /// amount (in byte) to increment pointer from last access of current + /// contiguous dimension to first access of next one. 
+ LongIndex inc_next_strided_; + /// amount (in byte) to increment pointer from last access to first access + /// of next tile + LongIndex inc_next_; + /// amount (in byte) to increment pointer from first access of current tile + /// to first access of next tile + LongIndex inc_advance_; + + public: + // Default ctor + CUTLASS_HOST_DEVICE + Params() : stride_(0), inc_contiguous_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : stride_({layout.stride(0), layout.stride(1)}) + { + inc_contiguous_ = (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) * + sizeof_bits::value / 8; + + inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) * + sizeof_bits::value / 8; + + inc_next_strided_ = + inc_strided_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_; + + if (kAdvanceRank) { + // advance along strided dimension + inc_advance_ = + Shape::kStrided * LongIndex(stride_[1]) * sizeof_bits::value / 8; + } else { + // advance along contiguous dimension + inc_advance_ = Shape::kContiguous * stride_[0] * sizeof_bits::value / 8; + } + + inc_next_ = inc_advance_ - + LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - + LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_; + }; + }; + +private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + // + // Data members + // + + /// Parameters object with precomputed internal state + Params params_; + + /// Internal pointer to first access of tile + BytePointer pointer_; + + UnderlyingPredicates the_predicates; + Mask residual_tile_mask; + +private: + /// Computes predicates based on internally tracked per-thread offset. 
+ CUTLASS_DEVICE + void compute_predicates_( + /// Extent of the matrix window + TensorCoord extent, + /// optionally, simplify predicate calculation during 'steady state' phase + bool is_steady_state = false) + { + the_predicates.compute_predicates_(extent, is_steady_state); + } + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : params_(params), + pointer_(reinterpret_cast(const_cast(pointer))), + the_predicates(extent) + { + the_predicates.set_predicates(thread_id, threadblock_offset); + + // update internal pointers + Layout layout(params_.stride_); + add_pointer_offset(layout(the_predicates.thread_offset_)); + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { the_predicates.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool is_residual_tile) + { + if (is_residual_tile) { the_predicates.set_mask(residual_tile_mask); } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + pointer_ += sizeof_bits::value * pointer_offset / 8; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + if (kAdvanceRank) { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]); + pointer_ += Shape::kContiguous * tile_offset[0]; + } else { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]); + pointer_ += Shape::kStrided * tile_offset[1]; + } + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const + { + return reinterpret_cast(pointer_) + the_predicates.iteration_vector_; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + the_predicates.operator++(); + ++the_predicates.iteration_vector_; + if (the_predicates.iteration_vector_ < kAccessesPerVector) { return *this; } + + the_predicates.iteration_vector_ = 0; + ++the_predicates.iteration_contiguous_; + + if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + pointer_ += params_.inc_contiguous_; + return *this; + } + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + the_predicates.iteration_contiguous_ = 0; + ++the_predicates.iteration_strided_; + + if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) { + pointer_ += params_.inc_next_strided_; + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + the_predicates.iteration_strided_ = 0; + + // advance to next tile + pointer_ += params_.inc_next_; + + // now return to start tile - if the iterator is subsequently advanced, this + // subtraction as well as the subsequent integer addition are both elided by + // the compiler. + pointer_ -= params_.inc_advance_; + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { the_predicates.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { the_predicates.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { the_predicates.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { the_predicates.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return the_predicates.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2 +/// column-major data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileAccessIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; + }; + +private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())) + { + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// 
Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + iterator_.add_tile_offset(make_Coord(tile_offset.row(), tile_offset.column())); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { return reinterpret_cast(iterator_.get()); } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return iterator_.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank-2 +/// row-major data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileAccessIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; + }; + +private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())) + { + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + iterator_.add_tile_offset(make_Coord(tile_offset.column(), tile_offset.row())); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { return reinterpret_cast(iterator_.get()); } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return iterator_.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major +/// interleaved data. It is mapped to the congruous layout. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// + +template +class PredicatedTileAccessIteratorResidualLast, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::ColumnMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.row() * kInterleavedK, + extent.column() / kInterleavedK), + thread_id, + layout::PitchLinearCoord(threadblock_offset.row() * kInterleavedK, + threadblock_offset.column() / kInterleavedK)) + { + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { return reinterpret_cast(iterator_.get()); } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return iterator_.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major +/// interleaved data. +// It is mapped to the congruous layout. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileAccessIteratorResidualLast, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::RowMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.column() * kInterleavedK, + extent.row() / kInterleavedK), + thread_id, + layout::PitchLinearCoord(threadblock_offset.column() * kInterleavedK, + threadblock_offset.row() / kInterleavedK)) + { + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast(params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) + { + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { return reinterpret_cast(iterator_.get()); } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) + { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { return iterator_.valid(); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h new file mode 100644 index 000000000000..8d4173f1a6a2 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h @@ -0,0 +1,886 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once +#include +#include +#include +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template +struct atomic_store {}; + +template +struct atomic_store::value>::type> { + using Element = typename AccessType::Element; + static const int kCount = AccessType::kElements; + + CUTLASS_DEVICE + atomic_store(AccessType const& D, void* ptr, bool pred_guard) + { + static_assert(!(kCount % 2), "kCount must be even"); + half2* p = reinterpret_cast(ptr); + uint const* data = reinterpret_cast(&D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + : + : "r"((int)pred_guard)); + for (int i = 0; i < kCount / 2; i++) { + asm volatile(" @p red.relaxed.global.add.noftz.f16x2 [%0], %1;\n" + : + : "l"(p + i), "r"(data[i])); + } + asm volatile("}\n" ::); + } +}; + +template +struct atomic_store::value>::type> { + using Element = typename AccessType::Element; + static const int kCount = AccessType::kElements; + + CUTLASS_DEVICE + atomic_store(AccessType const& D, void* ptr, bool pred_guard) + { + Element* p = reinterpret_cast(ptr); + uint const* data = reinterpret_cast(&D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + : + : "r"((int)pred_guard)); + for (int i = 0; i < kCount; i++) { + asm volatile(" @p red.relaxed.global.add.f32 [%0], %1;\n" + : + : "l"(p + i), "r"(data[i])); + } + asm volatile("}\n" ::); + } +}; + +template +class PredicatedTileIteratorAffineRankNAtomic { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::AffineRankN; + using TensorRef = TensorRef; + using TensorView = TensorView; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + 
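The two `atomic_store` specializations above drive predicated `red.relaxed.global.add` PTX (packed `f16x2` for half precision, scalar `f32` for float) so epilogue fragments are accumulated directly into global memory. As a rough functional sketch only, not the code path used here, the same effect can be written with plain CUDA atomics; `atomicAdd` on `__half2` needs sm_60 or newer, and unlike `red` it also returns the previous value, which is simply ignored.

```cpp
#include <cuda_fp16.h>

// Functional sketch of the predicated reduce-store idea (not the inline-PTX path above).
template <int kCount>
__device__ void atomic_accumulate_half(const __half2 (&src)[kCount / 2],
                                       __half2* dst,
                                       bool pred_guard)
{
    static_assert(kCount % 2 == 0, "kCount must be even");
    if (pred_guard) {
        for (int i = 0; i < kCount / 2; ++i) {
            atomicAdd(dst + i, src[i]);  // adds two fp16 lanes at once (sm_60+)
        }
    }
}

template <int kCount>
__device__ void atomic_accumulate_float(const float (&src)[kCount], float* dst, bool pred_guard)
{
    if (pred_guard) {
        for (int i = 0; i < kCount; ++i) {
            atomicAdd(dst + i, src[i]);  // per-element atomic accumulation
        }
    }
}
```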
using LongIndex = typename Layout::LongIndex; + using TensorCoord = typename Layout::TensorCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Count::kTile; + + static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); + static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); + static_assert(ThreadMap::Iterations::kCluster > 0, + "ThreadMap::Iterations::kCluster must be > 0"); + static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); + static_assert(!(Layout::kRank % 2), + "Layout rank must be even. This assumes the first half of the " + "modes correspond to the 'row' " + "and the second half of the modes correspond to the 'column'"); + + static bool const kBigEndian = false; + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + /// Parameters structure + struct Params { + // + // Data members + // + + Layout layout; + + /// Stride in units of bytes along M modes + Coord stride_m; + + /// Stride in units of bytes along N modes + Coord stride_n; + + /// Fast divmod objects divided by tensor extents + FastDivmod divmod_m[(Layout::kRank == 2) ? 1 : (Layout::kRank / 2 - 1)]; + + /// Fast divmod objects divided by tensor extents + FastDivmod divmod_n[(Layout::kRank == 2) ? 1 : (Layout::kRank / 2 - 1)]; + + int64_t rank2_inc_col; + int64_t rank2_inc_row; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(TensorCoord const& extent, Layout const& layout_) : layout(layout_) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Layout::kRank / 2; ++i) { + stride_m[i] = OffsetBytes(layout_.stride()[i]); + stride_n[i] = OffsetBytes(layout_.stride()[i + Layout::kRank / 2]); + } + + if (kBigEndian) { + // "Big Endian" scheme + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Layout::kRank / 2 - 1; ++i) { + divmod_m[i] = FastDivmod(extent[i + 1]); + divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2 + 1]); + } + } else { + // "Little Endian" scheme + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Layout::kRank / 2 - 1; ++i) { + divmod_m[i] = FastDivmod(extent[i]); + divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2]); + } + } + } + + CUTLASS_HOST_DEVICE + Params(Layout const& layout_) : layout(layout_) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Layout::kRank / 2; ++i) { + stride_m[i] = OffsetBytes(layout_.stride()[i]); + stride_n[i] = OffsetBytes(layout_.stride()[i + Layout::kRank / 2]); + } + + rank2_inc_col = ThreadMap::Delta::kColumn * stride_n[0]; + rank2_inc_row = ThreadMap::Delta::kRow * stride_m[0]; + } + }; + + /// Mask object + struct Mask { + static int const kCount = ThreadMap::Iterations::kColumn; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { enable(); } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { predicates[i] = false; } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { predicates[i] = true; } + } + }; + +private: + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. 
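The `FastDivmod` arrays held in `Params` exist so that a flat row/column coordinate can be split into the individual affine modes without runtime division in the inner loop. A plain-integer sketch of the little-endian decomposition used here, with made-up mode extents:

```cpp
#include <cstdio>

int main()
{
    int extents[3] = {4, 8, 16};   // assumed sizes of three column modes
    int coord_n = 173;             // flat coordinate to decompose
    int modes_n[3];
    for (int i = 0; i < 2; ++i) {  // only kRank/2 - 1 divisors are needed
        modes_n[i] = coord_n % extents[i];
        coord_n /= extents[i];
    }
    modes_n[2] = coord_n;          // whatever remains indexes the last mode
    std::printf("%d %d %d\n", modes_n[0], modes_n[1], modes_n[2]);  // prints: 1 3 5
    return 0;
}
```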
+ Params params_; + + /// Byte-level pointer + uint8_t* byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in columns + Index extent_col_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column position (assuming steady-state predicates have + /// been computed) + Index thread_start_column_; + + /// Internal state counter + int state_[3]; + + /// Offsets in columns, cached for performance + int64_t offset_modes_n_[ThreadMap::Iterations::kColumn]; + + // + // Static asserts about internal strides + // + + static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); + +private: + // + // Methods + // + +public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + PredicatedTileIteratorAffineRankNAtomic( + Params const& params, + Element* pointer, + MatrixCoord extent, + int thread_idx, + MatrixCoord threadblock_offset = MatrixCoord(), + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : params_(params) + { + MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_row_ = extent.row(); + extent_col_ = extent.column(); + + thread_start_row_ = thread_offset.row(); + thread_start_column_ = thread_offset.column(); + + if (Layout::kRank > 2) { + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { + // + // Compute coordinate and decompose into N modes + // + + int coord_n = thread_start_column_ + c * ThreadMap::Delta::kColumn; + + mask_.predicates[c] = coord_n < extent.column(); + + Coord modes_n; + + int64_t offset_modes_n = 0; + + if (kBigEndian) { + modes_n = CoordinateDecomposition(coord_n, params_.divmod_n); + + offset_modes_n = dot(modes_n, params_.stride_n); + } else { + modes_n = CoordinateDecompositionLittleEndian( + coord_n, params_.divmod_n); + + offset_modes_n = dot(modes_n, params_.stride_n); + } + + offset_modes_n_[c] = offset_modes_n; + } + + if (!pointer) { mask_.clear(); } + } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer); + + // Initialize internal state counter + state_[0] = state_[1] = state_[2] = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) + { + uint8_t* byte_pointer = byte_pointer_; + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + int row_begin = thread_start_row_ + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + int64_t offset_modes_m = row_begin * params_.stride_m[0]; + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + // + // Compute coordinate and decompose into M modes + // + + 
int coord_m = row * ThreadMap::Delta::kRow + row_begin; + + Coord modes_m; + + if (Layout::kRank > 2) { + if (kBigEndian) { + modes_m = CoordinateDecomposition(coord_m, + params_.divmod_m); + } else { + modes_m = CoordinateDecompositionLittleEndian( + coord_m, params_.divmod_m); + } + + offset_modes_m = dot(modes_m, params_.stride_m); + } + + // + // Compute the offset due to modes M + // + + bool row_guard = (coord_m < extent_row_); + int64_t offset_modes_n = thread_start_column_ * params_.stride_n[0]; + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + // + // Compute coordinate and decompose into N modes + // + + if (Layout::kRank > 2) { offset_modes_n = offset_modes_n_[column]; } + + // + // Compute the pointer and access + // + bool guard; + if (Layout::kRank > 2) { + guard = row_guard && mask_.predicates[column]; + } else { + guard = (coord_m < extent_row_) && + ((thread_start_column_ + ThreadMap::Delta::kColumn * column) < + extent_col_); + } + + atomic_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)(byte_pointer + offset_modes_m + offset_modes_n + byte_offset), + guard); + + if (Layout::kRank == 2) { offset_modes_n += params_.rank2_inc_col; } + } + + if (Layout::kRank == 2) { offset_modes_m += params_.rank2_inc_row; } + } + } + } + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_byte_offset(frag, 0); } + + CUTLASS_DEVICE + void load(Fragment& frag) {} + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + PredicatedTileIteratorAffineRankNAtomic& operator++() + { + ++state_[0]; + thread_start_row_ += ThreadMap::Shape::kRow; + + if (state_[0] == ThreadMap::Count::kRow) { + state_[0] = 0; + ++state_[1]; + + thread_start_row_ += + (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; + + if (state_[1] == ThreadMap::Count::kGroup) { + state_[1] = 0; + ++state_[2]; + + thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * + ThreadMap::Count::kRow * ThreadMap::Shape::kRow; + + if (state_[2] == ThreadMap::Count::kCluster) { state_[2] = 0; } + } + } + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { mask_.clear(); } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { mask_.enable(); } + + ///< Sets the mask + CUTLASS_DEVICE void get_mask(Mask& mask) { mask = mask_; } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; } +}; + +template +class PredicatedTileIteratorAtomic { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Count::kTile; + + static bool constexpr PermuteD = !layout::is_trivial_permute; + + static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); + static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); + static_assert(ThreadMap::Iterations::kCluster > 0, + 
"ThreadMap::Iterations::kCluster must be > 0"); + static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { + using Base = PredicatedTileIteratorParams; + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc()) + { + } + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + + /// Mask object + struct Mask { + static int const kCount = ThreadMap::Iterations::kColumn; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { enable(); } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { predicates[i] = false; } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { predicates[i] = true; } + } + }; + +private: + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. + PredicatedTileIteratorParams params_; + + /// Byte-level pointer. This pointer is usually for both load() and store(), + /// unless PermuteD is performed. When having PermuteD, byte_pointer_ is only + /// for load(). + uint8_t* byte_pointer_; + + /// Byte-level pointer for store(). Due to PermuteD Op, store_byte_pointer_ + /// may be with different address computation compared to byte_pointer_. 
+ uint8_t* store_byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in rows + Index extent_column_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column + Index thread_start_column_; + + /// Internal state counter + int state_[3]; + + /// Scatter indices + int const* indices_; + + /// PermuteDLayout + PermuteDLayout permute_layout_; + + // + // Static asserts about internal strides + // + + static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides"); + +private: + // + // Methods + // + +public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + PredicatedTileIteratorAtomic(PredicatedTileIteratorParams const& params, + Element* pointer, + TensorCoord extent, + int thread_idx, + TensorCoord threadblock_offset = TensorCoord(), + int const* indices = nullptr) + : params_(params), + indices_(indices), + permute_layout_(PitchLinearCoord(extent.column(), extent.row()), + params_.stride * kElementsPerAccess / sizeof(AccessType)) + { + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_row_ = extent.row(); + extent_column_ = extent.column(); + + thread_start_row_ = thread_offset.row(); + thread_start_column_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { + mask_.predicates[c] = + ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < extent.column()); + } + + // Null pointer performs no accesses + if (!pointer) { mask_.clear(); } + + if (ScatterD && !indices) { mask_.clear(); } + + // Initialize byte_pointer_ + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.row()) * LongIndex(params_.stride) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; + + if (ScatterD) { + byte_pointer_ = + reinterpret_cast(pointer) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; + } + + // store_byte_pointer_ is set to be the same with byte_pointer_ unless + // PermuteD is used. + store_byte_pointer_ = PermuteD ? 
reinterpret_cast(pointer) : byte_pointer_; + + // Initialize internal state counter + state_[0] = state_[1] = state_[2] = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + store_byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const + { + uint8_t* byte_pointer = store_byte_pointer_; + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + if (PermuteD) { + int col_offset = column * ThreadMap::Delta::kColumn; + + int col = col_offset + thread_start_column_; + int row = row_offset + thread_start_row_; + + // Locate memory_pointer + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + permute_layout_(PitchLinearCoord(col, row)) * sizeof(AccessType) / + kElementsPerAccess); + } + atomic_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[0], + guard); + + if (!PermuteD) { + memory_pointer += (ThreadMap::Delta::kColumn / kElementsPerAccess); + } + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD && !PermuteD) { byte_pointer += params_.increment_row; } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) const { store_with_byte_offset(frag, 0); } + + CUTLASS_DEVICE + void load(Fragment& frag) {} + + CUTLASS_DEVICE + MatrixCoord thread_start() const + { + return MatrixCoord(thread_start_row_, thread_start_column_); + } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_row() const { return thread_start_row_; } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_column() const { return thread_start_column_; } + + /// Extent of the matrix in rows + CUTLASS_DEVICE + Index extent_row() const { return extent_row_; } + + /// Extent of the matrix in columns + CUTLASS_DEVICE + Index extent_column() const { return extent_column_; } + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + PredicatedTileIteratorAtomic& 
operator++() + { + ++state_[0]; + + if (!ScatterD && !PermuteD) { store_byte_pointer_ += params_.advance_row; } + + if (!ScatterD) { byte_pointer_ += params_.advance_row; } + + thread_start_row_ += ThreadMap::Shape::kRow; + + if (state_[0] == ThreadMap::Count::kRow) { + state_[0] = 0; + ++state_[1]; + byte_pointer_ += params_.advance_group; + store_byte_pointer_ += params_.advance_group; + + thread_start_row_ += + (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; + + if (state_[1] == ThreadMap::Count::kGroup) { + state_[1] = 0; + ++state_[2]; + byte_pointer_ += params_.advance_cluster; + store_byte_pointer_ += params_.advance_cluster; + + thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * + ThreadMap::Count::kRow * ThreadMap::Shape::kRow; + + if (state_[2] == ThreadMap::Count::kCluster) { + state_[2] = 0; + byte_pointer_ += params_.advance_tile; + store_byte_pointer_ += params_.advance_tile; + + thread_start_row_ += ThreadMap::Shape::kGroup * ThreadMap::Shape::kRow * + ThreadMap::Shape::kCluster * ThreadMap::Shape::kTile; + } + } + } + + return *this; + } + + /// Advances a number of positions to load or store + CUTLASS_HOST_DEVICE + PredicatedTileIteratorAtomic& operator+=(int increment) + { + // Row + state_[0] += increment; + int increment_row = state_[0] / ThreadMap::Count::kRow; + state_[0] = state_[0] % ThreadMap::Count::kRow; + + byte_pointer_ += (params_.advance_row * increment); + store_byte_pointer_ += (params_.advance_row * increment); + thread_start_row_ += (ThreadMap::Shape::kRow * increment); + + // Group + state_[1] += increment_row; + int increment_group = state_[1] / ThreadMap::Count::kGroup; + state_[1] = state_[1] % ThreadMap::Count::kGroup; + + byte_pointer_ += (params_.advance_group * increment_row); + store_byte_pointer_ += (params_.advance_group * increment_row); + thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * + ThreadMap::Count::kRow * increment_row; + + // Cluster + state_[2] += increment_group; + int increment_cluster = state_[2] / ThreadMap::Count::kCluster; + state_[2] = state_[2] % ThreadMap::Count::kCluster; + + byte_pointer_ += (params_.advance_cluster * increment_group); + store_byte_pointer_ += (params_.advance_cluster * increment_group); + thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * + ThreadMap::Count::kRow * ThreadMap::Shape::kRow * increment_group; + + // Tile + byte_pointer_ += (params_.advance_tile * increment_cluster); + store_byte_pointer_ += (params_.advance_tile * increment_cluster); + thread_start_row_ += ThreadMap::Shape::kGroup * ThreadMap::Shape::kRow * + ThreadMap::Shape::kCluster * ThreadMap::Shape::kTile * + increment_cluster; + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { mask_.clear(); } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { mask_.enable(); } + + ///< Sets the mask + CUTLASS_DEVICE void get_mask(Mask& mask) const { mask = mask_; } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; } +}; + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_residual_last.h new file mode 100644 index 000000000000..629047dbb057 --- /dev/null +++ 
b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_residual_last.h @@ -0,0 +1,1938 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Templates implementing loading of tiles from pitch-linear rank=2 + tensors. + + This iterator uses masks to guard out-of-bounds accesses. The first tile + this iterator visits maybe partial, then the remaining tiles are complete. + So, we only need to compute the predicates twice, once before the first tile + and once for the remaining full tiles which can share the same predicates. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. +*/ + +#pragma once + +#include "cutlass/arch/memory.h" +#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedTileIteratorResidualLast +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +/// Regular tile iterator using a precomputed control structure to minimize +/// register liveness and integer arithmetic. +/// +/// Layout is assumed to be invariant at the time the precomputed "Params" +/// object is constructed. 
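Before the iterator's full interface, a minimal illustration of the masking idea the file comment describes: every access is guarded by a predicate comparing its coordinate against the tensor extent, so the partial (residual) tile and the full tiles can share one code path. The shapes and names below are assumptions for illustration, not the iterator's real layout math.

```cpp
#include <cstdio>

// Toy predicated tile load: out-of-bounds lanes become no-ops and yield zero.
void load_tile_guarded(const float* ptr, int extent_row, int stride,
                       int thread_start_row, int col, float (&frag)[4])
{
    const int kDeltaRow = 8;  // assumed spacing between one thread's accesses
    for (int i = 0; i < 4; ++i) {
        int row = thread_start_row + i * kDeltaRow;
        bool pred = (row < extent_row);                   // the "mask" bit for this access
        frag[i] = pred ? ptr[row * stride + col] : 0.0f;  // predicated load
    }
}

int main()
{
    float data[32 * 16];
    for (int i = 0; i < 32 * 16; ++i) { data[i] = float(i); }
    float frag[4];
    load_tile_guarded(data, /*extent_row=*/30, /*stride=*/16, /*thread_start_row=*/10, /*col=*/3, frag);
    std::printf("%.0f %.0f %.0f %.0f\n", frag[0], frag[1], frag[2], frag[3]);  // 163 291 419 0
    return 0;
}
```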
+/// +/// Base pointer and tensor extents may be specified at the time the iterator is +/// constructed. Subsequently, they are assumed to be immutable. +/// +/// Adding a logical coordinate offset may be performed at the time the iterator +/// is constructed. Subsequent additions to logical coordinate offset may be +/// performed but are relatively expensive. +/// +/// Visitation order is intended to first visit a "residual" tile that may be +/// partially full in both the advance dimension and the steady-state dimension. +/// This is assumed to be the last tile in the iteration sequence. Advancing an +/// iterator that has just been constructed moves to the first tile that is full +/// in the advance dimension and recomputes predicates. Subsequent accesses may +/// be performed without updating internal predicates and are efficient in terms +/// of live register state and pointer arithmetic instructions. +/// +/// To be efficient, this assumes the iterator will be dereferenced and advanced +/// at least once outside any looping structure to minimize integer arithmetic. +/// +/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to +/// dereferencing the iterator. +/// +/// +/// Example: +/// +/// An efficient pipeline structure may be constructed as follows: +/// +// template +// __global__ void kernel( +// typename Iterator::Params params, +// typename Iterator::Element *ptr, +// TensorCoord extent) { +// +// typename Iterator::Fragment fragment; +// +// TensorCoord threadblock_offset(0, 0); +// +// Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets); +// +// +// fragment = *iter; // load "residue" tile first +// ++iter; // advance to first "steady state" tile and update +// internal masks +// +// +// #pragma unroll +// for (int i = Remaining - 1; i >= 0; --i) { +// +// f(fragment); +// +// if (!i) { +// iter.clear_mask(); // light-weight operation to clear masks - +// subsequent loads become NO-OPs. +// } +// +// fragment = *iter; // load tile during "steady state" phase +// ++iter; // advance to next tile - lightweight due to +// steady-state masks +// } +// } +// +// void host(TensorView view) { +// +// using Iterator = +// transform::threadblock::PredicatedTileIteratorResidualLast; +// +// typename Iterator::Params params(view.layout()); +// +// kernel(params, view.data()); +// } +/// +/// +template +class PredicatedTileIteratorResidualLast; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. 
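A hypothetical instantiation of the primary template may help when reading the specializations that follow. The template-parameter order (Shape, Element, Layout, AdvanceRank, ThreadMap, AccessSize), the stripmined thread map, and the tile shape are all assumptions chosen for illustration; real kernels in this PR build their iterators from their own tile configurations.

```cpp
// Sketch only; include paths assume the repository root is on the include path.
#include "cutlass/numeric_types.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/transform/pitch_linear_thread_map.h"
#include "csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_residual_last.h"

using Shape = cutlass::layout::PitchLinearShape<256, 8>;
using Element = cutlass::half_t;
using ThreadMap = cutlass::transform::
    PitchLinearStripminedThreadMap<Shape, /*Threads=*/32, /*ElementsPerAccess=*/8>;

using Iterator = cutlass::transform::threadblock::PredicatedTileIteratorResidualLast<
    Shape,
    Element,
    cutlass::layout::PitchLinear,
    /*AdvanceRank=*/1,  // advance along the strided dimension
    ThreadMap,
    /*AccessSize=*/8>;

// Params is host-constructible from the tensor's layout (its leading dimension).
inline Iterator::Params make_params(int64_t leading_dim)
{
    return Iterator::Params(cutlass::layout::PitchLinear(leading_dim));
}
```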
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::PitchLinear; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + /// Type used for internal memory accesses + using AccessType = + AlignedArray::value / 8)>; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = PredicatedTileAccessIteratorResidualLast; + + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename TileAccessIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + using Base = typename TileAccessIterator::Params::Base; + + friend PredicatedTileIteratorResidualLast; + + private: + /// Parameters object + typename TileAccessIterator::Params params_; + + public: + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout) {} + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : params_(base) {} + }; + +private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + +private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : address_iterator_(params.params_, pointer, extent, thread_id, threadblock_offset, indices) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + if (kAdvanceRank) + address_iterator_.add_tile_offset({0, 1}); + else + address_iterator_.add_tile_offset({1, 0}); + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { address_iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { address_iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { address_iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { address_iterator_.get_mask(mask); } + + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + load_with_byte_offset(frag, pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) + { + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + address_iterator_.set_iteration_index(idx); + char const* byte_ptr = + reinterpret_cast(address_iterator_.get()) + byte_offset; + + AccessType const* access_ptr = reinterpret_cast(byte_ptr); + + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + + ++address_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_byte_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + store_with_byte_offset(frag, pointer_offset * sizeof_bits::value / 8); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) + { + address_iterator_.set_iteration_index(0); + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + char* byte_ptr = reinterpret_cast(address_iterator_.get()) + byte_offset; + AccessType* access_ptr 
= reinterpret_cast(byte_ptr); + + if (address_iterator_.valid()) { *access_ptr = frag_ptr[idx]; } + ++address_iterator_; + } + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_byte_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = + PredicatedTileIteratorResidualLast, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessSize, + Gather>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column()), + indices) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID 
of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) + { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) + { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. 
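The column-major specialization above forwards matrix coordinates to the underlying pitch-linear iterator as (contiguous, strided) = (row, column), while the row-major specialization that follows swaps the two and flips the advance rank. A tiny host-side illustration of that remapping, with made-up coordinates:

```cpp
#include <cstdio>

struct PitchLinearCoord { int contiguous, strided; };

PitchLinearCoord from_column_major(int row, int col) { return {row, col}; }  // rows are contiguous
PitchLinearCoord from_row_major(int row, int col) { return {col, row}; }     // columns are contiguous

int main()
{
    PitchLinearCoord a = from_column_major(3, 7);
    PitchLinearCoord b = from_row_major(3, 7);
    std::printf("%d %d / %d %d\n", a.contiguous, a.strided, b.contiguous, b.strided);  // 3 7 / 7 3
    return 0;
}
```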
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = + PredicatedTileIteratorResidualLast, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 1 : 0), + ThreadMap, + AccessSize, + Gather>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = nullptr ///< Gather indices + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row()), + indices) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) + { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) + { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data. 
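For the affine rank-2 specialization coming up, neither dimension is assumed contiguous: the offset of an element is a dot product of its coordinate with two explicit strides, which the access iterator turns into byte-level pointer arithmetic. A toy example with assumed strides:

```cpp
#include <cstdint>
#include <cstdio>

// Toy affine rank-2 addressing: both dimensions carry an explicit stride.
int64_t affine_rank2_offset(int row, int col, int64_t stride_row, int64_t stride_col)
{
    return int64_t(row) * stride_row + int64_t(col) * stride_col;
}

int main()
{
    // Assumed strides, e.g. a transposed, padded view of an fp32 matrix (units: elements).
    std::printf("%lld\n", static_cast<long long>(affine_rank2_offset(5, 17, 4, 4096)));  // 69652
    return 0;
}
```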
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRankN<2>; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + /// Type used for internal memory accesses + using AccessType = + AlignedArray::value / 8)>; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = PredicatedTileAccessIteratorResidualLast; + + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename TileAccessIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + friend PredicatedTileIteratorResidualLast; + + private: + /// Parameters object + typename TileAccessIterator::Params params_; + + public: + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout) {} + + CUTLASS_HOST_DEVICE + Params() {} + }; + +private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + +private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : address_iterator_(params.params_, pointer, extent, thread_id, threadblock_offset) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + if (kAdvanceRank) + address_iterator_.add_tile_offset(make_Coord(0, 1)); + else + address_iterator_.add_tile_offset(make_Coord(1, 0)); + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { address_iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { address_iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { address_iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { address_iterator_.get_mask(mask); } + + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + load_with_byte_offset(frag, pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) + { + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + address_iterator_.set_iteration_index(idx); + char const* byte_ptr = + reinterpret_cast(address_iterator_.get()) + byte_offset; + + AccessType const* access_ptr = reinterpret_cast(byte_ptr); + + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + + ++address_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_byte_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + store_with_byte_offset(frag, pointer_offset * sizeof_bits::value / 8); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) + { + address_iterator_.set_iteration_index(0); + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + char* byte_ptr = reinterpret_cast(address_iterator_.get()) + byte_offset; + 
AccessType* access_ptr = reinterpret_cast(byte_ptr); + + if (address_iterator_.valid()) { *access_ptr = frag_ptr[idx]; } + ++address_iterator_; + } + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_byte_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2 +/// column-major data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = + PredicatedTileIteratorResidualLast, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) + { + } + }; + +private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int 
thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) + { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) + { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2 +/// row-major data. 
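The column-major wrapper above forwards strides and coordinates to the AffineRankN<2> iterator unchanged, while the row-major specialization that follows passes `(stride(1), stride(0))` and `(column, row)` instead. Both can share one underlying implementation because a rank-2 affine offset is invariant under swapping the coordinate pair together with the stride pair; a small self-contained check of that identity (plain C++, not CUTLASS types):

```
#include <cassert>
#include <cstdint>

// offset of (x, y) under a rank-2 affine layout with strides (sx, sy)
int64_t affine2_offset(int64_t x, int64_t y, int64_t sx, int64_t sy) { return x * sx + y * sy; }

int main()
{
    const int64_t s0 = 7, s1 = 1;  // arbitrary strides
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            assert(affine2_offset(r, c, s0, s1) == affine2_offset(c, r, s1, s0));
    return 0;
}
```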
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = + PredicatedTileIteratorResidualLast, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 1 : 0), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) + { + } + }; + +private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) + { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) + { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for interleaved data. +/// It is mapped to the congruous layout. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// + +template +class PredicatedTileIteratorResidualLast, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::ColumnMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 0 : 1), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.row() * kInterleavedK, + extent.column() / kInterleavedK), + thread_id, + layout::PitchLinearCoord(threadblock_offset.row() * kInterleavedK, + threadblock_offset.column() / kInterleavedK)) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void 
add_pointer_offset(LongIndex pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for interleaved-32 +/// data. It is mapped to the congruous layout. 
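The constructor above turns a logical (row, column) extent into `PitchLinearCoord(extent.row() * kInterleavedK, extent.column() / kInterleavedK)`: with an interleaved layout, groups of kInterleavedK columns are packed along the contiguous dimension, so the contiguous extent grows by that factor while the strided extent shrinks by it. A hedged sketch of the offset arithmetic, assuming the usual CUTLASS definition of ColumnMajorInterleaved<k>:

```
#include <cstdint>
#include <cstdio>

// Assumed offset formula for ColumnMajorInterleaved<k>:
// groups of k columns are stored next to each other along the contiguous axis.
int64_t column_major_interleaved_offset(int row, int col, int64_t stride, int k)
{
    return int64_t(col / k) * stride  // which strided "super column"
         + int64_t(row) * k           // rows step by k along the contiguous axis
         + (col % k);                 // position inside the interleave group
}

int main()
{
    const int rows = 8, cols = 8, k = 4;
    const int64_t stride = int64_t(rows) * k;  // contiguous elements per strided step
    std::printf("pitch-linear extent: contiguous=%d strided=%d\n", rows * k, cols / k);
    std::printf("element (1,5) -> offset %lld\n",
                (long long)column_major_interleaved_offset(1, 5, stride, k));
    return 0;
}
```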
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedTileIteratorResidualLast, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { +public: + static_assert(AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::RowMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 1 : 0), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = + cutlass::Array; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) : params_(base) {} + }; + +private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_(params.params_, + pointer, + layout::PitchLinearCoord(extent.column() * kInterleavedK, + extent.row() / kInterleavedK), + thread_id, + layout::PitchLinearCoord(threadblock_offset.column() * kInterleavedK, + threadblock_offset.row() / kInterleavedK)) + { + } + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast(Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast(params, pointer, extent, thread_id, make_Coord(0, 0)) + { + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex 
pointer_offset) + { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() + { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) + { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { iterator_.clear_mask(enable); } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { iterator_.set_residual_tile(enable); } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { iterator_.enable_mask(); } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { iterator_.set_mask(mask); } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { iterator_.get_mask(mask); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) + { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) + { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/transpose_warp_iterator.h b/csrc/deepspeed4science/evoformer_attn/iterators/transpose_warp_iterator.h new file mode 100644 index 000000000000..2435c07f8989 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/iterators/transpose_warp_iterator.h @@ -0,0 +1,57 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#pragma once
+
+#include "warp_iterator_from_smem.h"
+
+template <typename WarpIterator>
+struct TransposeWarpIterator {
+    using Iterator = char;
+    static bool constexpr kSupportsTranspose = false;
+};
+
+template <
+    /// Operand identity
+    cutlass::gemm::Operand Operand,
+    /// Data type of A elements
+    typename Element,
+    bool kTranspose>
+struct TransposeWarpIterator<
+    cutlass::gemm::warp::WarpIteratorFromSmem<Operand, Element, kTranspose>> {
+    using Iterator = cutlass::gemm::warp::WarpIteratorFromSmem<Operand, Element, !kTranspose>;
+    static bool constexpr kSupportsTranspose = true;
+};
diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/warp_iterator_from_smem.h b/csrc/deepspeed4science/evoformer_attn/iterators/warp_iterator_from_smem.h
new file mode 100644
index 000000000000..7dd59832b4b0
--- /dev/null
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/warp_iterator_from_smem.h
@@ -0,0 +1,269 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/*! \file + \brief Inspired from + "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" Loads tiles of GEMM + operands from a RowMajor shared-memory layout into registers to use by A100 + TensorCores. + + The difference with "mma_tensor_op_tile_access_iterator.h" is that: + (1) We use "ldmatrix" to load tiles, rather than manual loads (slightly + faster) (2) We support to transpose the operand (eg read `A.transpose()` when + the shared memory holds `A`) + + This is only implemented for the specific shapes. +*/ +#pragma once + +#include + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace gemm { +namespace warp { + +template < + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + bool kTranspose = false> +class WarpIteratorFromSmem { +public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = cutlass::MatrixShape<32, 32>; + + /// Operand tag + static Operand const kOperand = Operand_; + + /// Basic check + static_assert( + kOperand == Operand::kA || kOperand == Operand::kB, + "WarpIteratorFromSmem may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + static_assert(sizeof_bits::value == 16, "Only supported for half"); + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = cutlass::MatrixShape<16, 8>; + + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + static int const kOpDelta = 1; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = + (sizeof_bits::value >= 32 ? 1 : 32 / sizeof_bits::value); + + using InstructionCount = MatrixShape; + + static int const kIterations = (kOperand == Operand::kA) ? InstructionCount::kColumn + : InstructionCount::kRow; + +public: + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + + /// Memory access type + // using AccessType = AlignedArray; + using AccessType = Array; + + static int constexpr kWarpShapeDivisibleInner = + (kOperand == Operand::kA ? 
InstructionShape::kColumn : InstructionShape::kRow); + static int constexpr kAccessesInner = (kWarpShapeDivisibleInner / kElementsPerAccess) / 4; + static int const kTilesPerInstruction = InstructionShape::kRow / 8; + +private: + /// Underlying tensor reference + TensorRef ref_; + + /// Origin + MatrixCoord origin_; + + /// Iterations in a tile + int iterations_; + +public: + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem(TensorRef const& ref, int lane_id) + : WarpIteratorFromSmem(ref, {Shape::kRow, Shape::kColumn}, lane_id) + { + } + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem(TensorRef const& ref, TensorCoord extent, int lane_id) + : ref_(ref), iterations_(0) + { + int ldsm_vec_num = (lane_id >> 3); + if (kOperand == Operand::kA) { + origin_ = MatrixCoord(lane_id % 8, 0); + static_assert(InstructionCount::kRow * kAccessesInner * kTilesPerInstruction == 4, ""); + CUTLASS_PRAGMA_UNROLL + for (int inst_m_idx = 0; inst_m_idx < InstructionCount::kRow; ++inst_m_idx) { + CUTLASS_PRAGMA_UNROLL + for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) { + CUTLASS_PRAGMA_UNROLL + for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; + ++access_m_idx) { + int access_idx = + access_m_idx + + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx); + + MatrixCoord offset(access_m_idx * 8 + inst_m_idx * InstructionShape::kRow, + inner_idx * 4 * kElementsPerAccess); + + if (access_idx == ldsm_vec_num) { + if (kTranspose) { offset = MatrixCoord(offset.column(), offset.row()); } + origin_ += offset; + } + } + } + } + } else { + origin_ = MatrixCoord(0, lane_id % 8); + static_assert(InstructionCount::kColumn * kAccessesInner == 4, ""); + CUTLASS_PRAGMA_UNROLL + for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; ++inst_n_idx) { + CUTLASS_PRAGMA_UNROLL + for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) { + int access_idx = inner_idx + kAccessesInner * inst_n_idx; + + MatrixCoord offset(inner_idx * 4 * kElementsPerAccess, inst_n_idx * 8); + + if (access_idx == ldsm_vec_num) { + if (kTranspose) { offset = MatrixCoord(offset.column(), offset.row()); } + origin_ += offset; + } + } + } + } + + ref_.add_coord_offset(origin_); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem& add_tile_offset(TensorCoord const& tile_offset) + { + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, + tile_offset.column() * Shape::kColumn); + if (kTranspose) { coord_offset = TensorCoord{coord_offset.column(), coord_offset.row()}; } + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + void advance() + { + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } else { + add_tile_offset({1, 0}); + } + + iterations_ = 0; + } + + /// increase iterations in a tile + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem& operator++() + { + iterations_++; + + if (iterations_ >= kIterations) advance(); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
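`add_tile_offset` and `advance` above move the iterator in units of the 32x32 warp tile: operand A steps along the K (column) direction with `{0, 1}`, operand B along the row direction with `{1, 0}`, and `kTranspose` simply swaps the resulting coordinate before it is applied to the shared-memory reference. A minimal sketch of that arithmetic (illustrative names, not the kernel's code):

```
#include <cstdio>
#include <utility>

// Element-space delta for a tile offset, mirroring add_tile_offset() above.
std::pair<int, int> tile_to_element_offset(int tile_row, int tile_col, bool transpose)
{
    constexpr int kTileRows = 32, kTileCols = 32;  // Shape of the warp tile
    int row = tile_row * kTileRows, col = tile_col * kTileCols;
    if (transpose) std::swap(row, col);            // read the operand as its transpose
    return {row, col};
}

int main()
{
    auto a = tile_to_element_offset(0, 1, false);  // what advance() does for operand A
    auto b = tile_to_element_offset(1, 0, false);  // what advance() does for operand B
    std::printf("A step -> (+%d,+%d), B step -> (+%d,+%d)\n", a.first, a.second, b.first, b.second);
    return 0;
}
```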
+ CUTLASS_DEVICE + void load(Fragment& frag) const + { + AccessType* access_ptr = reinterpret_cast(&frag); + using LoadLayout = + typename platform::conditional::type; + + MatrixCoord offset; + if (kOperand == Operand::kA) { + offset = MatrixCoord(0, iterations_ * InstructionShape::kColumn); + } else { + offset = MatrixCoord(iterations_ * InstructionShape::kRow, 0); + } + if (kTranspose) { offset = MatrixCoord(offset.column(), offset.row()); } + cutlass::arch::ldsm(access_ptr[0], ref_.data() + ref_.offset(offset)); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/deepspeed4science/evoformer_attn/kernel_backward.h b/csrc/deepspeed4science/evoformer_attn/kernel_backward.h new file mode 100644 index 000000000000..87e6df18bb04 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/kernel_backward.h @@ -0,0 +1,1965 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include +#include + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/fast_math.h" +#include "cutlass/functional.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/vector.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" + +#include "gemm_kernel_utils.h" + +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h" +#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/vector_iterator.h" +#include "epilogue/epilogue_pipelined.h" +#include "iterators/epilogue_predicated_tile_iterator.h" + +#include "epilogue/epilogue_grad_bias.h" +#include "gemm/custom_mma.h" +#include "gemm/find_default_mma.h" +#include "gemm/mma_accum_lambda_iterator.h" +#include "gemm/mma_from_smem.h" +#include "transform/bias_broadcast.h" +#include "transform/tile_smem_loader.h" + +#include + +using namespace gemm_kernel_utils; + +namespace { + +template +struct GmemTile { + /* + Helper functions to efficient store/load RF to gmem + + GEMM accumulators have a particular format on A100, and + it takes some compute/shared-memory to rearrange them to + a RowMajor or ColumnMajor format in global memory through + an Epilogue. The same complexity goes for loading into RF. + + This class loads/stores RF as they are, and can be used for + efficient accumulation across gemms for instance: + + ``` + GmemTile tile; + for (int i = 0; i < N; ++i) { + // ... + + Fragment accum; + if (i == 0) { + accum.clear(); + } else { + tile.load(accum); + } + mma(accum, ...); + if (i < N-1) { + // Store for next GEMM + tile.store(accum); + } else { + // Store in tensor (eg RowMajor) + epilogue(accum); + } + + // ... 
+ } + ``` + */ + + // 128bits per thread + using AccessType = cutlass::Array; + static constexpr int32_t kBytes = sizeof(AccessType); + static constexpr int32_t kStride = kNumThreads * AccessType::kElements; + static constexpr int32_t kNumIters = FragmentType::kElements / AccessType::kElements; + static constexpr int32_t kElementsStored = kNumThreads * FragmentType::kElements; + static_assert(FragmentType::kElements % AccessType::kElements == 0, + "fragment not aligned on 128 bits"); + + float* ptr; + + CUTLASS_DEVICE void load(FragmentType& fragment, int thread_id) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + cutlass::arch::global_load(sub_fragment, gmem_ptr, true); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + fragment[i * AccessType::kElements + j] = sub_fragment[j]; + } + } + } + + CUTLASS_DEVICE void store(FragmentType const& fragment, int thread_id) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + sub_fragment[j] = fragment[i * AccessType::kElements + j]; + } + cutlass::arch::global_store(sub_fragment, gmem_ptr, true); + } + } +}; + +template +constexpr int getWarpsPerSm() +{ + constexpr bool is_half = !cutlass::platform::is_same::value; + if (Arch::kMinComputeCapability >= 80) { return is_half ? 12 : 8; } + return 8; +} +} // namespace + +template < + // which arch we target (eg `cutlass::arch::Sm80`) + typename ArchTag_, + // input/output type + typename scalar_t_, + // run optimized kernel because memory accesses will be aligned + bool kIsAligned_, + // use dropout if enabled + bool kApplyDropout_, + // when doing a GEMM, preload the next one (uses more shmem) + bool kPreload_, + // block dimensions + int kBlockSizeI_, + int kBlockSizeJ_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + template class Broadcast1_ = BroadcastNoLoad, + template class Broadcast2_ = BroadcastNoLoad> +struct AttentionBackwardKernel { + using scalar_t = scalar_t_; + using output_t = scalar_t; + using output_accum_t = float; + using lse_scalar_t = float; + using accum_t = float; + using ArchTag = ArchTag_; + static constexpr bool kIsAligned = kIsAligned_; + static constexpr bool kApplyDropout = kApplyDropout_; + static constexpr bool kPreload = kPreload_; + static constexpr int kBlockSizeI = kBlockSizeI_; + static constexpr int kBlockSizeJ = kBlockSizeJ_; + static constexpr int kMaxK = kMaxK_; + + struct Params { + // Input tensors + scalar_t* query_ptr; // [Mq, nH, K] + scalar_t* key_ptr; // [Mk, nH, K] + scalar_t* value_ptr; // [Mk, nH, Kv] + lse_scalar_t* logsumexp_ptr; // [nH, Mq] + scalar_t* output_ptr; // [Mq, nH, Kv] + scalar_t* grad_output_ptr; // [Mq, nH, Kv] + accum_t* delta_ptr; // [nH, Mq] + int32_t* cu_seqlens_q_ptr = nullptr; + int32_t* cu_seqlens_k_ptr = nullptr; + + // Output tensors + output_t* grad_query_ptr; // [Mq, nH, K] + output_t* grad_key_ptr; // [Mk, nH, K] + output_t* grad_value_ptr; // [Mk, nH, Kv] + + accum_t* grad_bias1_ptr = nullptr; + accum_t* grad_bias2_ptr = nullptr; + int32_t B = 0; + int32_t N = 0; + scalar_t* bias1_ptr = nullptr; + scalar_t* bias2_ptr = nullptr; + + // 
Accumulators + union { + output_accum_t* workspace = nullptr; // [Mq, Kq] + [Mkv, Kq] + [Mkv, Kv] + output_accum_t* workspace_gk; + }; + output_accum_t* workspace_gv; // (will be calculated by the kernel) + output_accum_t* workspace_gq; // (will be calculated by the kernel) + + // Scale + accum_t scale; + + // Dimensions/strides + int32_t head_dim = -1; + int32_t head_dim_value = -1; + int32_t num_queries = -1; + int32_t num_keys = -1; + int32_t num_heads = -1; + + int32_t q_strideM; + int32_t k_strideM; + int32_t v_strideM; + int32_t gO_strideM; + int32_t gB_strideM; + int8_t gQKV_strideM_multiplier = 1; // 3 for packed, 1 otherwise + + // RNG sequence offset based on batch_id and head_id + unsigned long long dropout_batch_head_rng_offset; + float dropout_prob = 0.0f; + + CUTLASS_HOST_DEVICE int32_t o_strideM() const { return head_dim_value * num_heads; } + CUTLASS_HOST_DEVICE int32_t gQ_strideM() const + { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int32_t gK_strideM() const + { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int32_t gV_strideM() const + { + return gQKV_strideM_multiplier * num_heads * head_dim_value; + } + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int64_t o_strideH; + int32_t q_strideH; + int32_t k_strideH; + int32_t v_strideH; + int64_t o_strideB; + int64_t q_strideB; + int64_t k_strideB; + int64_t v_strideB; + int64_t lse_strideB; + int64_t lse_strideH; + int64_t delta_strideB; + int64_t delta_strideH; + int32_t num_batches; + + int64_t gO_strideB = 0; + int64_t gQ_strideB = 0; + int64_t gK_strideB = 0; + int64_t gV_strideB = 0; + int64_t gB_strideB = 0; + int64_t gO_strideH = 0; + int64_t gQ_strideH = 0; + int64_t gK_strideH = 0; + int64_t gV_strideH = 0; + int64_t gB_strideH = 0; + + CUTLASS_DEVICE bool advance_to_block() + { + int64_t batch_id = blockIdx.z; + int32_t head_id = blockIdx.y; + + if (kNeedsAccumGradQ || kNeedsAccumGradK || kNeedsAccumGradV) { + assert(workspace_size() == 0 || workspace != nullptr); + + workspace += (batch_id * num_heads + head_id) * workspace_strideBH(); + workspace = warp_uniform(workspace); + workspace_gv = workspace + workspace_elements_gk(); + workspace_gq = workspace_gv + workspace_elements_gv(); + } else { + workspace = nullptr; + } + + // Advance pointers that depend on the total concatenated + // number of queries, as `num_queries` is modified in the block + // below + dropout_batch_head_rng_offset = batch_id * (num_heads * num_queries * num_keys) + + head_id * (num_queries * num_keys); + logsumexp_ptr += batch_id * lse_strideB + head_id * lse_strideH; + + query_ptr += batch_id * q_strideB + head_id * q_strideH; + key_ptr += batch_id * k_strideB + head_id * k_strideH; + value_ptr += batch_id * v_strideB + head_id * v_strideH; + output_ptr += batch_id * o_strideB + head_id * o_strideH; + grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH; + delta_ptr += batch_id * delta_strideB + head_id * delta_strideH; + + grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH; + grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH; + grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH; + using broadcast_1 = Broadcast1_; + using broadcast_2 = Broadcast2_; + + if (broadcast_1::kEnable && grad_bias1_ptr) { + grad_bias1_ptr += batch_id * num_queries; + } + if (broadcast_2::kEnable && grad_bias2_ptr) { + auto strideB = num_heads * num_queries * num_keys; + auto strideH = num_queries * 
num_keys; + grad_bias2_ptr += (batch_id / N) * strideB + head_id * strideH; + } + if (broadcast_1::kEnable && bias1_ptr) { + bias1_ptr = broadcast_1::advance(bias1_ptr, + batch_id / N, + batch_id % N, + head_id, + num_queries * N, + num_queries, + 0); + } + if (broadcast_2::kEnable && bias2_ptr) { + auto strideB = num_heads * num_queries * num_keys; + auto strideH = num_queries * num_keys; + bias2_ptr = broadcast_2::advance( + bias2_ptr, batch_id / N, batch_id % N, head_id, strideB, 0, strideH); + } + + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + output_ptr = warp_uniform(output_ptr); + grad_output_ptr = warp_uniform(grad_output_ptr); + delta_ptr = warp_uniform(delta_ptr); + + grad_query_ptr = warp_uniform(grad_query_ptr); + grad_key_ptr = warp_uniform(grad_key_ptr); + grad_value_ptr = warp_uniform(grad_value_ptr); + if (broadcast_1::kEnable) { + grad_bias1_ptr = warp_uniform(grad_bias1_ptr); + bias1_ptr = warp_uniform(bias1_ptr); + } + if (broadcast_2::kEnable) { + grad_bias2_ptr = warp_uniform(grad_bias2_ptr); + bias2_ptr = warp_uniform(bias2_ptr); + } + + return true; + } + + __host__ dim3 getBlocksGrid() const { return dim3(1, num_heads, num_batches); } + __host__ dim3 getThreadsGrid() const { return dim3(kWarpSize * kNumWarpsPerBlock, 1, 1); } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gk() const + { + if (!kNeedsAccumGradK) { return 0; } + return align_up(num_keys, (int32_t)kBlockSizeJ) * + align_up(head_dim, (int32_t)kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gv() const + { + if (!kNeedsAccumGradV) { return 0; } + return align_up(num_keys, (int32_t)kBlockSizeJ) * + align_up(head_dim_value, (int32_t)kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gq() const + { + if (!kNeedsAccumGradQ) { return 0; } + if (num_keys <= kBlockSizeJ) { return 0; } + return align_up(num_queries, (int32_t)kBlockSizeI) * + align_up(head_dim, (int32_t)kBlockSizeJ); + } + CUTLASS_HOST_DEVICE int64_t workspace_strideBH() const + { + // Aligned on 128bits + return align_up( + workspace_elements_gk() + workspace_elements_gv() + workspace_elements_gq(), + int64_t(4)); + } + CUTLASS_HOST_DEVICE int64_t workspace_size() const + { + // Returns size of buffer we need to run this kernel + return num_batches * num_heads * workspace_strideBH() * sizeof(float); + } + }; + + static constexpr int64_t kWarpSize = 32; + + // If this is true, we store and accumulate dK/dV in RF + // rather than going back to gmem every time + static constexpr bool kIsHalf = cutlass::sizeof_bits::value <= 16; + static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI; + static_assert(!kPreload || (kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF), + "preload MMA not supported"); + static constexpr bool kPrologueQK = kPreload; + static constexpr bool kPrologueGV = kPreload; + static constexpr bool kPrologueDOV = kPreload; + static constexpr bool kPrologueGQ = kPreload; + static constexpr bool kPrologueGK = kPreload; + + static constexpr int64_t kNumWarpsPerBlock = (kBlockSizeI * kBlockSizeJ) / (32 * 32); + + // Compute delta for the f16 kernels + // TODO: Figure out why it's slower on the f32 kernels + // (something due to RF pressure?) 
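`workspace_size()` above tells the host how many bytes of float accumulation buffer the kernel needs (the gK, gV and gQ regions, padded per batch*head by `workspace_strideBH()`), and `advance_to_block()` then derives `workspace_gv`/`workspace_gq` from the base pointer. A hedged host-side sketch of how a launcher might use it; the helper name and the cudaMalloc-based allocation are illustrative, not the launcher DeepSpeed actually ships:

```
#include <cstdint>
#include <cuda_runtime.h>

// Allocate (if needed) the float workspace a backward kernel instance asks for
// and hand it to Params; the kernel carves out the gK/gV/gQ regions itself.
template <typename Kernel>
float* allocate_backward_workspace(typename Kernel::Params& p)
{
    float* workspace = nullptr;
    const int64_t bytes = p.workspace_size();  // num_batches * num_heads * workspace_strideBH() * sizeof(float)
    if (bytes > 0) { cudaMalloc(&workspace, static_cast<size_t>(bytes)); }
    p.workspace = workspace;
    return workspace;
}
```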
+ // TODO: Remove condition on `kOutputInRF` - this is needed to work + // around a compiler bug on V100, not exactly sure why but I spent + // too much time on this already. Reproducible with + // (B, Mq, Mkv, K) = (1, 1, 1, 136) for instance + static constexpr bool kKernelComputesDelta = + kIsHalf && (kOutputInRF || ArchTag::kMinComputeCapability != 70); + + static constexpr bool kNeedsAccumGradQ = + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradK = + !kOutputInRF && !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradV = + !kOutputInRF && !cutlass::platform::is_same::value; + + // Launch bounds + static constexpr int64_t kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int64_t kMinBlocksPerSm = + getWarpsPerSm() / kNumWarpsPerBlock; + + using GemmType = DefaultGemmType; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration; + static constexpr auto kOptimalAlignement = + cutlass::platform::max(DefaultConfig::kAlignmentA, DefaultConfig::kAlignmentB); + static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment; + + struct MatmulQK { + /* + attn_T = k_j @ q_i.transpose(-2, -1) # matmul + attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2, + -1)).exp() # epilogue + + with attn_T.shape = (kBlockSizeJ, kBlockSizeI) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::DefaultMma< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + accum_t, // ElementC + cutlass::layout::RowMajor, // LayoutC + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + DefaultConfig::kStages, + typename GemmType::Operator, + false, // AccumulatorsInRowMajor = false, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using MmaCore = typename DefaultMma::MmaCore; + using Mma = typename MakeCustomMma::Mma; + + // used for efficient load of bias tile (Bij) from global memory to shared + // memory + using BiasLoader = + TileSmemLoader, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = + typename cutlass::gemm::threadblock::B2bGemm; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator::Iterator; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradV { + /* + grad_v[j_start:j_end] += attn_T @ do_i # matmul + + Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K) + (we might need to iterate multiple times on K) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = + cutlass::gemm::kernel::DefaultGemm; + + // if dropout: + // for computing dVj += (Pij.T * Zij) @ dOi + // Pij_dropped.T = Pij.T * Zij is computed on the fly as fragments of + // Pij.T are loaded in. 
The reason we do it this way is because Pij.T and + // Zij are reused in later steps, while Pij_dropped.T is only needed in + // this step. computing Pij_dropped.T on the fly allows us to avoid + // keeping all 3 of Pij_dropped.T, Pij.T, and Zij in shared memory at the + // same time. + // if no dropout: + // for computing dVj += Pij.T @ dOi + using DefaultMmaFromSmem = typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + typename MatmulQK::AccumulatorSharedStorage, + kApplyDropout>; // kScaleOperandA + + using Mma = typename DefaultMmaFromSmem::Mma; + using WarpIteratorA = typename DefaultMmaFromSmem::WarpIteratorA; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + struct MatmulDOIVJ { + /* + doi_t_vj = do_i @ v_j.transpose(-2, -1) # matmul + tmp = (doi_t_vj - Di.unsqueeze(1)) * attn # inplace / epilogue? + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + + using ElementC = accum_t; // CSY: Change it for better accuracy + using ElementAccum = accum_t; + + // no-op output op - epilogue just stores result to global memory + using BiasGradEpilogueOutputOp = typename cutlass::epilogue::thread::LinearCombination< + ElementC, + DefaultConfig::EpilogueOutputOp::kCount, + typename DefaultConfig::EpilogueOutputOp::ElementAccumulator, + typename DefaultConfig::EpilogueOutputOp::ElementCompute, + cutlass::epilogue::thread::ScaleType::Nothing>; + + using DefaultGemm = typename cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + ElementC, // ElementC + cutlass::layout::RowMajor, // LayoutC + ElementAccum, // ElementAccumulator + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + BiasGradEpilogueOutputOp, // EpilogueOutputOp + void, // ThreadblockSwizzle (not used) + // multiple preloads, dropout Zij tile, and 3 stages push us over shared + // memory capacity on A100. set a ceiling on number of stages to save + // shared memory if dropout is in use. + kPreload && kApplyDropout && (kBlockSizeI * kBlockSizeJ > 64 * 64) + ? 
cutlass::const_min(2, DefaultConfig::kStages) + : DefaultConfig::kStages, // Stages + false, // SplitKSerial + typename GemmType::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using Mma = typename MakeCustomMma::Mma; + + // epilogue used to write bias gradient, which is just the output of this + // matmul with some operations applied to the fragment + using BiasGradEpilogue = typename DefaultGemm::Epilogue; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = + typename cutlass::gemm::threadblock::B2bGemm; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradQ { + // grad_q <- tmp @ k_j + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = + cutlass::gemm::kernel::DefaultGemm; + + using DefaultMmaFromSmem = typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + typename MatmulDOIVJ::AccumulatorSharedStorage, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + struct MatmulGradK { + // grad_k <- tmp.transpose(-2, -1) @ q_i + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = + cutlass::gemm::kernel::DefaultGemm; + + using DefaultMmaFromSmemN = typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + typename MatmulQK::AccumulatorSharedStorage, + false>; // kScaleOperandA + using DefaultMmaFromSmemT = typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + typename MatmulDOIVJ::AccumulatorSharedStorage, + false, // kScaleOperandA + kPreload>; // kTransposeA + using DefaultMmaFromSmem = + typename cutlass::platform::conditional::type; + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + using broadcast_1 = Broadcast1_; + using broadcast_2 = Broadcast2_; + + // shared storage for keeping Zij matrix. not needed if we aren't using + // dropout, in which case we use an empty array to save shared memory + using ZijSharedStorage = typename cutlass::platform::conditional< + kApplyDropout, + typename MatmulQK::AccumulatorSharedStorage, + // dummy shared storage object that takes up no space. 
+ typename cutlass::gemm::threadblock::AccumulatorSharedStorage< +#ifdef _WIN32 + // windows builds throw the error: + // "type containing an unknown-size array is not allowed" + // if we try to make Zij shared storage zero-sized. + // To get around this just make it sized 1 on windows. + typename cutlass::gemm::GemmShape<1, 1, 0>, +#else + typename cutlass::gemm::GemmShape<0, 0, 0>, +#endif + typename MatmulQK::AccumulatorSharedStorage::Element, + typename MatmulQK::AccumulatorSharedStorage::Layout, + typename cutlass::MatrixShape<0, 0>>>::type; + + struct SharedStoragePrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + } persistent; + union { + struct { + // part1 - after Q.K / dV / dO.V + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + // typename MatmulQK::BiasLoader::SmemTile bias; + cutlass::AlignedBuffer bias; + // 4. store Pij. it is needed: + // - in dVj += (Pij.T * Zij) @ dOi + // - in dSij = Pij * (dPij - Di) + // 6. dVj += (Pij.T * Zij) @ dOi + // 10. write to fragment + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 5. store Zij. it is needed: + // - to compute Pij_dropped = Pij * Zij on the fly as fragments of Pij + // are loaded for the computation of dVj. + // - to compute dPij = (dOi @ Vj.T) * Zij + // 6. used in dVj += (Pij.T * Zij) @ dOi + // 9. used in dPij = dPij_dropped * Zij + ZijSharedStorage zij; + + union { + // 2. prologue for dVj + // 6. workspace for dVj += (Pij.T * Zij) @ dOi + typename MatmulGradV::Mma::SharedStorage mm_gradV; + // 7. dVj epilogue + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + + // 3. prologue for dPij_dropped + // 8. used in dPij_dropped = dOi @ Vj.T + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + } part1; + + struct { + // part2 - dQ + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; // (preload) + union { + // store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + }; + + } part2; + + struct { + // part3 - after last iteration on dQ's epilogue / dK + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue_lastIter; + + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + } part3; + + struct { + // part4 - after last iteration on dK's epilogue / preload next K.Q_t + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + + // If we reach end of current key, dump RF->gmem with "final" epilogues + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue_final; + } part4; + }; +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { return INSIDE_STRUCT.FIELDNAME; } + + FIELD(persistent, di) + FIELD(persistent, mm_qk_k) + FIELD(part1, bias) + FIELD(part1, attn_shared_storage) + FIELD(part1, zij) + FIELD(part1, mm_gradV) 
+ FIELD(part1, gradV_epilogue) + FIELD(part1, mm_doivj) + FIELD(part2, mm_gradK) + FIELD(part2, mm_gradQ) + FIELD(part2, gradB_epilogue) + FIELD(part2, gradQ_epilogue) + FIELD(part2, tmp_shared_storage) + FIELD(part3, tmpT_shared_storage) + FIELD(part3, gradQ_epilogue_lastIter) + FIELD(part3, gradK_epilogue) + FIELD(part4, mm_qk_q) + FIELD(part4, gradK_epilogue_final) + FIELD(part4, gradV_epilogue_final) + }; + + struct SharedStorageNoPrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + } persistent; + union { + struct { + // part1 - Q.K matmul + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + } part1; + + struct { + // part2 - compute gradV + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + cutlass::AlignedBuffer bias; + // 2. store Pij to shared memory. it is needed: + // - in this step, where it is used in dVj += (Pij.T * Zij) @ dOi + // - in next step where it is used in dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 3. store Zij. it is needed: + // - in this step, where it is used to compute Pij_dropped = Pij * Zij + // on the + // fly as fragments of Pij are loaded for the computation of dVj. + // - later to compute dPij = (dOi @ Vj.T) * Zij + ZijSharedStorage zij; + + union { + typename MatmulGradV::Mma::SharedStorage mm_gradV; + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + } part2; + + struct { + // part3 - DO.V matmul + union { + // first compute dPij = (dOi @ Vj.T) * Zij + // and dSij = Pij * (dPij - Di) + struct { + // (from part2) - Pij for computing dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + // (from part2) - Zij for computing dPij = dPij_dropped * Zij + ZijSharedStorage zij; + // matmul to compute dOiVj + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + }; + // then store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + }; + } part3; + + struct { + // part4 - compute gradQ + typename MatmulQK::AccumulatorSharedStorage tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue_lastIter; + }; + } part4; + + struct { + // part5 - compute gradK + typename MatmulQK::AccumulatorSharedStorage tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradK::Mma::SharedStorage mm_gradK; + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + }; + } part5; + + struct { + // part6 - store RF accumulated into gmem + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue_final; + } part6; + }; +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { return INSIDE_STRUCT.FIELDNAME; } + + FIELD(persistent, di) + FIELD(part1, mm_qk_k) + FIELD(part1, mm_qk_q) + FIELD(part2, bias) + FIELD(part2, attn_shared_storage) + FIELD(part2, zij) + FIELD(part2, mm_gradV) + FIELD(part2, gradV_epilogue) + FIELD(part3, mm_doivj) + FIELD(part3, gradB_epilogue) + FIELD(part4, tmpT_shared_storage) + FIELD(part4, 
tmp_shared_storage) + FIELD(part4, mm_gradQ) + FIELD(part4, gradQ_epilogue) + FIELD(part4, gradQ_epilogue_lastIter) + FIELD(part5, mm_gradK) + FIELD(part5, gradK_epilogue) + FIELD(part6, gradK_epilogue_final) + FIELD(part6, gradV_epilogue_final) + }; + + using SharedStorage = typename cutlass::platform:: + conditional::type; + + struct OutputFragments { + typename MatmulGradV::Mma::FragmentC gradV; + typename MatmulGradK::Mma::FragmentC gradK; + + CUTLASS_DEVICE void clear() + { + gradV.clear(); + gradK.clear(); + } + }; + + static bool __host__ check_supported(Params const& p) + { + CHECK_ALIGNED_PTR(p.query_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.key_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.value_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment); + EVOFORMER_CHECK(p.lse_strideH % 8 == 0, "LSE is not correctly aligned"); + EVOFORMER_CHECK(p.lse_strideB % 8 == 0, "LSE is not correctly aligned"); + EVOFORMER_CHECK(p.num_heads <= 1 || p.q_strideH % kMinimumAlignment == 0, + "query is not correctly aligned (strideH)"); + EVOFORMER_CHECK(p.num_heads <= 1 || p.k_strideH % kMinimumAlignment == 0, + "key is not correctly aligned (strideH)"); + EVOFORMER_CHECK(p.num_heads <= 1 || p.v_strideH % kMinimumAlignment == 0, + "value is not correctly aligned (strideH)"); + EVOFORMER_CHECK(p.num_batches <= 1 || p.q_strideB % kMinimumAlignment == 0, + "query is not correctly aligned (strideB)"); + EVOFORMER_CHECK(p.num_batches <= 1 || p.k_strideB % kMinimumAlignment == 0, + "key is not correctly aligned (strideB)"); + EVOFORMER_CHECK(p.num_batches <= 1 || p.v_strideB % kMinimumAlignment == 0, + "value is not correctly aligned (strideB)"); + EVOFORMER_CHECK(p.q_strideM % kMinimumAlignment == 0, + "query is not correctly aligned (strideM)"); + EVOFORMER_CHECK(p.k_strideM % kMinimumAlignment == 0, + "key is not correctly aligned (strideM)"); + EVOFORMER_CHECK(p.v_strideM % kMinimumAlignment == 0, + "value is not correctly aligned (strideM)"); + EVOFORMER_CHECK(p.dropout_prob <= 1.0f && p.dropout_prob >= 0.0f, + "Invalid value for `dropout_prob`"); + EVOFORMER_CHECK(kApplyDropout || p.dropout_prob == 0.0f, + "Set `kApplyDropout`=True to support `dropout_prob > 0`"); + EVOFORMER_CHECK(p.head_dim > 0, "Invalid value for `head_dim`"); + EVOFORMER_CHECK(p.head_dim_value > 0, "Invalid value for `head_dim_value`"); + EVOFORMER_CHECK(p.num_queries > 0, "Invalid value for `num_queries`"); + EVOFORMER_CHECK(p.num_keys > 0, "Invalid value for `num_keys`"); + EVOFORMER_CHECK(p.num_heads > 0, "Invalid value for `num_heads`"); + EVOFORMER_CHECK(p.num_batches > 0, "Invalid value for `num_batches`"); + EVOFORMER_CHECK(p.head_dim <= kMaxK, "kMaxK: Expected `head_dim < kMaxK`"); + EVOFORMER_CHECK(p.head_dim_value <= kMaxK, "kMaxK: Expected `head_dim_value < kMaxK`"); + return true; + } + + static CUTLASS_DEVICE void attention_kernel(Params p) + { + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + + uint16_t thread_id = threadIdx.x; + uint8_t warp_id = warp_uniform(thread_id / 32); + uint8_t lane_id = thread_id % 32; + + if (kPrologueQK) { + prologueQkNextIteration(shared_storage, p, 0, 0, warp_id, lane_id); + } + + // Computes (dO*out).sum(-1) and writes it to `p.delta_ptr` + if (kKernelComputesDelta) { + constexpr int kOptimalElements = 128 / cutlass::sizeof_bits::value; + if (p.head_dim_value % kOptimalElements == 0) { + for (int query_start = 0; query_start < 
p.num_queries; query_start += kBlockSizeI) { + computeDelta(p, query_start, warp_id, lane_id); + } + } else { + for (int query_start = 0; query_start < p.num_queries; query_start += kBlockSizeI) { + computeDelta<1>(p, query_start, warp_id, lane_id); + } + } + __syncthreads(); + } + + OutputFragments output_frags; + + int32_t key_start = 0; + int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ; + for (; key_start < key_end; key_start += kBlockSizeJ) { + output_frags.clear(); + int32_t query_start = getQueryStart(p, key_start); + int32_t query_end = + query_start + (p.num_queries - query_start) / kBlockSizeI * kBlockSizeI; + for (; query_start < query_end; query_start += kBlockSizeI) { + processBlockIJ( + shared_storage, output_frags, p, query_start, key_start, warp_id, lane_id); + } + // last (partial) query + if (query_start < p.num_queries) { + processBlockIJ( + shared_storage, output_frags, p, query_start, key_start, warp_id, lane_id); + } + if (kOutputInRF) { + writeFragsToGmem( + shared_storage, output_frags, p, key_start, warp_id, lane_id); + } else if (getQueryStart(p, key_start) >= p.num_queries) { + zfillGradKV(p, key_start, warp_id, lane_id); + } + __syncthreads(); + } + // Last (partial) key + if (key_start != p.num_keys) { + output_frags.clear(); + int32_t query_start = getQueryStart(p, key_start); + for (; query_start < p.num_queries; query_start += kBlockSizeI) { + warp_id = warp_uniform(warp_id); + processBlockIJ( + shared_storage, output_frags, p, query_start, key_start, warp_id, lane_id); + } + if (kOutputInRF) { + writeFragsToGmem( + shared_storage, output_frags, p, key_start, warp_id, lane_id); + } else if (getQueryStart(p, key_start) >= p.num_queries) { + zfillGradKV(p, key_start, warp_id, lane_id); + } + } + } + + static CUTLASS_DEVICE void loadDi(cutlass::Array& di, + Params const& p, + int32_t query_start) + { + int32_t thread_id = threadIdx.x + threadIdx.y * blockDim.x; + if (thread_id < kBlockSizeI) { + accum_t di_rf = accum_t(0); + if (query_start + thread_id < p.num_queries) { + di_rf = p.delta_ptr[query_start + thread_id]; + } + di[thread_id] = di_rf; + } + } + + template + static CUTLASS_DEVICE void zfillGradKV(Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) + { + constexpr int kThreadsPerKey = 8; + constexpr int kParallelKeys = kNumThreads / kThreadsPerKey; + static_assert(kBlockSizeJ % kParallelKeys == 0, ""); + // This function is not really optimized, but should rarely be used + // It's only used when some keys are "useless" and don't attend to + // any query, due to causal masking + int thread_id = 32 * warp_id + lane_id; + int k_shift = lane_id % kThreadsPerKey; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) { + int key = key_start + j + (thread_id / kThreadsPerKey); + if (!skipBoundsChecks && key >= p.num_keys) { continue; } + auto gv_ptr = p.grad_value_ptr + key * p.gV_strideM(); + auto gk_ptr = p.grad_key_ptr + key * p.gK_strideM(); + + for (int k = k_shift; k < p.head_dim_value; k += kThreadsPerKey) { + gv_ptr[k] = scalar_t(0); + } + for (int k = k_shift; k < p.head_dim; k += kThreadsPerKey) { gk_ptr[k] = scalar_t(0); } + } + } + + template + static CUTLASS_DEVICE void processBlockIJ(SharedStorage& shared_storage, + OutputFragments& output_frags, + Params& p, + int32_t query_start, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) + { + cutlass::MatrixCoord no_offset{0, 0}; + accum_t scale = p.scale; + int16_t thread_id = 32 * warp_id + lane_id; + auto rematerializeThreadIds = 
[&]() { + // Prevents `nvcc` from keeping values deduced from + // `thread_id`, `warp_id`, ... in RF - to reduce register pressure + warp_id = warp_uniform(thread_id / 32); + lane_id = thread_id % 32; + thread_id = 32 * warp_id + lane_id; + }; + + bool isFirstQuery = (query_start == getQueryStart(p, key_start)); + int32_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + bool isLastQuery = next_key != key_start; + __syncthreads(); + loadDi(shared_storage.di(), p, query_start); + + int32_t num_queries_in_block = + skipBoundsChecks ? MatmulQK::Mma::Shape::kN + : warp_uniform(cutlass::fast_min((int32_t)MatmulQK::Mma::Shape::kN, + p.num_queries - query_start)); + int32_t num_keys_in_block = + skipBoundsChecks ? MatmulQK::Mma::Shape::kM + : warp_uniform(cutlass::fast_min((int32_t)MatmulQK::Mma::Shape::kM, + p.num_keys - key_start)); + + auto prologueGradV = [&](int col) { + typename MatmulGradV::Mma::IteratorB iterator_dO( + {int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM + col, + {num_queries_in_block, p.head_dim_value - col}, + thread_id, + no_offset); + MatmulGradV::Mma::prologue( + shared_storage.mm_gradV(), iterator_dO, thread_id, num_queries_in_block); + }; + auto prologueGradQ = [&](int col) { + typename MatmulGradQ::Mma::IteratorB iterator_K( + {int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM + col, + {num_keys_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradQ::Mma::prologue( + shared_storage.mm_gradQ(), iterator_K, thread_id, num_keys_in_block); + }; + auto prologueGradK = [&](int col) { + typename MatmulGradK::Mma::IteratorB iterator_Q( + {int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM + col, + {num_queries_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradK::Mma::prologue( + shared_storage.mm_gradK(), iterator_Q, thread_id, num_queries_in_block); + }; + auto prologueDOV = [&]() { + typename MatmulDOIVJ::Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM, + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + typename MatmulDOIVJ::Mma::IteratorB iterator_B({int32_t(p.v_strideM)}, + p.value_ptr + key_start * p.v_strideM, + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + MatmulDOIVJ::Mma::prologue( + shared_storage.mm_doivj(), iterator_A, iterator_B, thread_id, p.head_dim_value); + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulQK + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulQK::Mma; + + cutlass::gemm::GemmCoord problem_size(num_keys_in_block, + num_queries_in_block, + p.head_dim // k + ); + + // k_j + typename Mma::IteratorA iterator_A({int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM, + {problem_size.m(), problem_size.k()}, + thread_id, + no_offset); + + // q_i.transpose(-2, -1) + typename Mma::IteratorB iterator_B({int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM, + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + Mma mma( + shared_storage.mm_qk_k(), shared_storage.mm_qk_q(), thread_id, warp_id, lane_id); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma.set_prologue_done(kPrologueQK); + 
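// [Editor's note] Illustrative sketch only, not part of the original diff: the
// MatmulQK block here recomputes the transposed attention probabilities that the
// forward pass produced, i.e. roughly Pij^T = exp(scale * (Kj . Qi) + Bij - LSE_i),
// matching the MatmulQK struct comment (the bias term is added by the
// AttentionBiasEpilogue below when broadcasting is enabled). A scalar reference of
// the same math, with hypothetical names and an assumed row-major block-local
// layout (expf needs <cmath> on the host), could look like:
auto reference_attn_T = [](float const* K, float const* Q, float const* bias,
                           float const* lse, float* attn_T, float scale,
                           int num_keys_blk, int num_queries_blk, int head_dim) {
    for (int j = 0; j < num_keys_blk; ++j) {        // rows of attn_T (keys)
        for (int i = 0; i < num_queries_blk; ++i) { // cols of attn_T (queries)
            float acc = 0.f;
            for (int d = 0; d < head_dim; ++d) {
                acc += K[j * head_dim + d] * Q[i * head_dim + d];
            }
            float b = bias ? bias[i * num_keys_blk + j] : 0.f;
            attn_T[j * num_queries_blk + i] = expf(scale * acc + b - lse[i]);
        }
    }
};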
mma.set_zero_outside_bounds(!skipBoundsChecks); + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + + // Epilogue: add LSE + exp and store that to our shared memory buffer + // shmem <- (matmul_result - + // logsumexp[i_start:i_end].unsqueeze(1)).exp() + int warp_idx_mn_0 = warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + + if (broadcast_1::kEnable || broadcast_2::kEnable) { + cutlass::TensorRef bias_tensor_ref( + shared_storage.bias().data(), + cutlass::layout::RowMajor(MatmulQK::ThreadblockShape::kM)); + using Shape = cutlass::MatrixShape; + AttentionBiasEpilogue + bias_epilogue; + bias_epilogue(bias_tensor_ref, + p.bias1_ptr + key_start, + p.bias2_ptr + query_start * p.num_keys + key_start, + thread_id, + {num_queries_in_block, num_keys_in_block}, + p.num_keys); + // Pij += Bij, Pij is in register fragment and Bij is in shared memory + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_n) {}, + [&](int accum_m, int accum_n, int idx) { + // remember we are transposed + accum[idx] = accum[idx] * scale + bias_tensor_ref.at({accum_n, accum_m}); + }, + [&](int accum_n) {}); + } else { + accum = cutlass::multiplies()(scale, accum); + } + + __syncthreads(); + if (kPrologueGV) { prologueGradV(0); } + if (kPrologueDOV) { prologueDOV(); } + + MatmulQK::B2bGemm::accumApplyLSEToSmem(shared_storage.attn_shared_storage(), + accum, + p.logsumexp_ptr + query_start, + problem_size.n(), + thread_id, + warp_id, + lane_id, + output_tile_coords); + + __syncthreads(); + } + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradV matmul + // + // grad_v[j_start:j_end] += attn_T @ do_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + constexpr bool kSingleIterationGradV = kMaxK <= MatmulGradV::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradV ? 
1 : p.head_dim_value); + col += MatmulGradV::ThreadblockShape::kN) { + using Mma = typename MatmulGradV::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, p.head_dim_value - col, num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradV::OutputTileIterator( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM() + col, + {num_keys_in_block, p.head_dim_value - col}, + thread_id); + }; + typename Mma::IteratorB iterator_B({int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM + col, + {num_queries_in_block, p.head_dim_value - col}, + thread_id, + no_offset); + + // if dropout: dVj += (Pij.T * Zij) @ dOi + // otherwise: dVj += Pij.T @ dOi + Mma mma(shared_storage.mm_gradV(), + // operand A: Pij + typename MatmulGradV::WarpIteratorA( + shared_storage.attn_shared_storage().accum_ref(), lane_id), + // if we're using dropout, operand A is Pij_dropped = Pij * Zij + // which is computed on the fly as fragments of Pij are loaded in + typename Mma::WarpIteratorAScale(shared_storage.zij().accum_ref(), lane_id), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradV::ThreadblockShape::kN; + AccumTileGmem gmem_tile{p.workspace_gv + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradV) { + output_frags.gradV.clear(); + } else { + gmem_tile.load(output_frags.gradV, thread_id); + } + } + mma.set_prologue_done(kPrologueGV); + + auto gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, output_frags.gradV, iterator_B, output_frags.gradV); + __syncthreads(); + if (kPrologueGV && !kSingleIterationGradV && + col + MatmulGradV::ThreadblockShape::kN < p.head_dim_value) { + prologueGradV(col + MatmulGradV::ThreadblockShape::kN); + } + + if (!kOutputInRF) { + if (kNeedsAccumGradV && !isLastQuery) { + gmem_tile.store(output_frags.gradV, thread_id); + } else { + accumulateInGmem(shared_storage.gradV_epilogue(), + output_frags.gradV, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradV, + warp_id, + lane_id); + } + } + } + __syncthreads(); + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulDOIVJ + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulDOIVJ::Mma; + // do_i + typename Mma::IteratorA iterator_A({int32_t(p.gO_strideM)}, + p.grad_output_ptr + query_start * p.gO_strideM, + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + + // v_j.transpose(-2, -1) + typename Mma::IteratorB iterator_B({int32_t(p.v_strideM)}, + p.value_ptr + key_start * p.v_strideM, + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + + Mma mma(shared_storage.mm_doivj(), thread_id, warp_id, lane_id); + mma.set_prologue_done(kPrologueDOV); + mma.set_zero_outside_bounds(!skipBoundsChecks); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = (p.head_dim_value + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + if (kPrologueGQ) { prologueGradQ(0); } + if (kPrologueGK) { prologueGradK(0); } + + int warp_idx_mn_0 = warp_id % 
(Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + // TODO: This must be terribly inefficient. There must be a better way + // tmp [RF] <- (accum [RF] - Di [smem] ) * attn_T.T [smem] + // attn_shared_storage [smem] <- tmp.T + // tmp_shared_storage [smem] <- tmp + { + using LambdaIterator = + typename DefaultMmaAccumLambdaIterator::Iterator; + auto lane_offset = + LambdaIterator::get_lane_offset(lane_id, warp_id, output_tile_coords); + + auto attn_T = shared_storage.attn_shared_storage().accum_ref(); + accum_t current_di; + // dSij = (dPij - Di) * Pij + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { current_di = shared_storage.di()[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + if (skipBoundsChecks || + (accum_m < num_queries_in_block && accum_n < num_keys_in_block)) { + accum_t attn = attn_T.at({accum_n, accum_m}); + accum[idx] = (accum[idx] - current_di) * attn; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) { + + }); + + using DefaultGemm = typename MatmulDOIVJ::DefaultGemm; + using OutputOp = typename MatmulDOIVJ::BiasGradEpilogueOutputOp; + if (broadcast_1::kEnable && p.grad_bias1_ptr) { + using Epilogue = + typename BiasGradEpilogueAffineRankN::Epilogue; + cutlass::layout::AffineRankN<2> layout({0, 1}); + auto dst_ptr = p.grad_bias1_ptr + key_start; + typename Epilogue::OutputTileIterator output_iter( + {layout}, + dst_ptr, + {num_queries_in_block, num_keys_in_block}, + (int)thread_id); + Epilogue epilogue(shared_storage.gradB_epilogue(), + (int)thread_id, + (int)warp_id, + (int)lane_id); + epilogue(OutputOp(1), output_iter, accum); + } + + if (broadcast_2::kEnable && p.grad_bias2_ptr) { + if (broadcast_1::kEnable) { __syncthreads(); } + using Epilogue = + typename BiasGradEpilogue::Epilogue; + typename Epilogue::OutputTileIterator::Params params{p.num_keys}; + auto dst_ptr = p.grad_bias2_ptr + query_start * p.num_keys + key_start; + typename Epilogue::OutputTileIterator output_iter( + params, dst_ptr, {num_queries_in_block, num_keys_in_block}, (int)thread_id); + Epilogue epilogue(shared_storage.gradB_epilogue(), + (int)thread_id, + (int)warp_id, + (int)lane_id); + epilogue(OutputOp(1), output_iter, accum); + } + + accum = accum * scale; + + __syncthreads(); + if (!MatmulGradK::DefaultMmaFromSmem::kIsTransposedA) { + auto tmpT = shared_storage.tmpT_shared_storage().accum_ref(); + // attn <- attn_T.T + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + tmpT.at({accum_n, accum_m}) = scalar_t(accum[idx]); + }, + [&](int accum_m) {}); + } + } + + MatmulDOIVJ::B2bGemm::accumToSmem( + shared_storage.tmp_shared_storage(), accum, lane_id, output_tile_coords); + __syncthreads(); + } + p.head_dim = warp_uniform(p.head_dim); + p.k_strideM = warp_uniform(p.k_strideM); + rematerializeThreadIds(); + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradQ matmul + // + // grad_q[i_start:i_end] += tmp @ k_j + ///////////////////////////////////////////////////////////////////////////////////////////////// + // Skip the loop & associated branches if we know at compile time the number + // of iterations + constexpr bool kSingleIterationGradQ = kMaxK <= MatmulGradQ::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradQ ? 
1 : p.head_dim); + col += MatmulGradQ::ThreadblockShape::kN) { + using Mma = typename MatmulGradQ::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_queries_in_block, + false ? MatmulGradQ::ThreadblockShape::kN : p.head_dim - col, + num_keys_in_block); + + // k_j + typename Mma::IteratorB iterator_B({int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM + col, + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto a = shared_storage.tmp_shared_storage().accum_ref(); + Mma mma(shared_storage.mm_gradQ(), + shared_storage.tmp_shared_storage(), + thread_id, + warp_id, + lane_id, + problem_size.k()); + + typename Mma::FragmentC accum; + + bool isFirst = key_start == 0; + int col_id = col / MatmulGradQ::ThreadblockShape::kN; + int num_cols = + kSingleIterationGradQ ? 1 : ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN); + int storage_id = (col_id + query_start / kBlockSizeI * num_cols); + AccumTileGmem gmem_tile{p.workspace_gq + storage_id * AccumTileGmem::kElementsStored}; + if (isFirst || !kNeedsAccumGradQ) { + accum.clear(); + } else { + gmem_tile.load(accum, thread_id); + } + + auto gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + mma.set_prologue_done(kPrologueGQ); + mma(gemm_k_iterations, accum, iterator_B, accum); + __syncthreads(); + bool isLastColumn = kSingleIterationGradQ || + (col + MatmulGradQ::ThreadblockShape::kN >= p.head_dim); + if (kPrologueGQ && !isLastColumn) { + prologueGradQ(col + MatmulGradQ::ThreadblockShape::kN); + } + + // Output results + int32_t next_query, next_key; + incrIteration(p, p.num_queries, key_start, next_query, next_key); + bool isLast = next_query > query_start || next_key >= p.num_keys; + if (kNeedsAccumGradQ && !isLast) { + gmem_tile.store(accum, thread_id); + } else { + typename MatmulGradQ::OutputTileIterator output_it( + typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()}, + p.grad_query_ptr + query_start * p.gQ_strideM() + col, + {problem_size.m(), problem_size.n()}, + thread_id); + accumulateInGmem(isLastColumn + ? shared_storage.gradQ_epilogue_lastIter() + : shared_storage.gradQ_epilogue(), + accum, + output_it, + isFirst || kNeedsAccumGradQ, + warp_id, + lane_id); + } + } + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradK matmul + // + // grad_k[i_start:i_end] += tmp.transpose(-2, -1) @ q_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + rematerializeThreadIds(); + + constexpr bool kSingleIterationGradK = kMaxK <= MatmulGradK::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradK ? 1 : p.head_dim); + col += MatmulGradK::ThreadblockShape::kN) { + using Mma = typename MatmulGradK::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col, + num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradK::OutputTileIterator( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM() + col, + {num_keys_in_block, + false ? 
MatmulGradK::ThreadblockShape::kN : p.head_dim - col}, + thread_id); + }; + + // q_i + typename Mma::IteratorB iterator_B({int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM + col, + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto getTmp = [&](int) { return &shared_storage.tmp_shared_storage(); }; + auto getTmpT = [&](int) { return &shared_storage.tmpT_shared_storage(); }; + // this is basically: + // opA = kIsTransposedA ? getTmp() : getTmpT(); + bool constexpr kIsTransposedA = MatmulGradK::DefaultMmaFromSmem::kIsTransposedA; + auto& opA = + *call_conditional::apply( + getTmp, getTmpT, 0); + Mma mma(shared_storage.mm_gradK(), opA, thread_id, warp_id, lane_id, problem_size.k()); + + int storage_id = col / MatmulGradK::ThreadblockShape::kN; + AccumTileGmem gmem_tile{p.workspace_gk + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradK) { + output_frags.gradK.clear(); + } else { + gmem_tile.load(output_frags.gradK, thread_id); + } + } + mma.set_prologue_done(kPrologueGK); + + auto gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, output_frags.gradK, iterator_B, output_frags.gradK); + __syncthreads(); + bool isLastColumn = kSingleIterationGradK || + col + MatmulGradK::ThreadblockShape::kN >= p.head_dim; + if (kPrologueGK && !isLastColumn) { + prologueGradK(col + MatmulGradK::ThreadblockShape::kN); + } + + if (kPrologueQK && isLastColumn) { + int32_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + DISPATCH_BOOL(next_key != key_start, kForceReloadK, ([&]() { + prologueQkNextIteration( + shared_storage, p, next_query, next_key, warp_id, lane_id); + })); + } + + // Output results + if (!kOutputInRF) { + if (kNeedsAccumGradK && !isLastQuery) { + gmem_tile.store(output_frags.gradK, thread_id); + } else { + accumulateInGmem(isLastColumn + ? 
shared_storage.gradK_epilogue_final() + : shared_storage.gradK_epilogue(), + output_frags.gradK, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradK, + warp_id, + lane_id); + __syncthreads(); + } + } + } + } + + static CUTLASS_DEVICE int32_t getQueryStart(Params const& p, int32_t key_start) { return 0; }; + + static CUTLASS_DEVICE void incrIteration(Params const& p, + int32_t query_start, + int32_t key_start, + int32_t& next_query, + int32_t& next_key) + { + next_query = query_start + kBlockSizeI; + next_key = key_start; + if (next_query >= p.num_queries) { + next_key = key_start + kBlockSizeJ; + next_query = getQueryStart(p, next_key); + } + } + + template + static CUTLASS_DEVICE void prologueQkNextIteration(SharedStorage& shared_storage, + Params const& p, + int32_t query_start, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) + { + if (query_start >= p.num_queries || key_start >= p.num_keys) { return; } + + static constexpr bool kReloadK = kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat; + int thread_id = 32 * warp_id + lane_id; + typename MatmulQK::Mma::IteratorA iterator_A({int32_t(p.k_strideM)}, + p.key_ptr + key_start * p.k_strideM, + {p.num_keys - key_start, p.head_dim}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + typename MatmulQK::Mma::IteratorB iterator_B({int32_t(p.q_strideM)}, + p.query_ptr + query_start * p.q_strideM, + {p.head_dim, p.num_queries - query_start}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + MatmulQK::Mma::prologue(shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + iterator_A, + iterator_B, + thread_id, + p.head_dim); + } + + template + static CUTLASS_DEVICE void writeFragsToGmem(SharedStorage& shared_storage, + OutputFragments& output_frags, + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) + { + uint16_t thread_id = 32 * warp_id + lane_id; + int32_t num_keys_in_block = + skipBoundsChecks + ? MatmulQK::Mma::Shape::kM + : cutlass::fast_min((int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start); + typename MatmulGradV::OutputTileIterator outputV_it( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM(), + {num_keys_in_block, p.head_dim_value}, + thread_id); + accumulateInGmem(shared_storage.gradV_epilogue_final(), + output_frags.gradV, + outputV_it, + true, + warp_id, + lane_id); + + typename MatmulGradK::OutputTileIterator outputK_it( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM(), + {num_keys_in_block, false ? MatmulGradK::ThreadblockShape::kN : p.head_dim}, + thread_id); + accumulateInGmem(shared_storage.gradK_epilogue_final(), + output_frags.gradK, + outputK_it, + true, + warp_id, + lane_id); + } + + template + static CUTLASS_DEVICE void accumulateInGmem( + typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem, + typename MatmulT::Mma::FragmentC const& accum, + typename MatmulT::OutputTileIterator output_it, + bool first, + uint8_t warp_id, + uint8_t lane_id) + { + using DefaultEpilogue = typename MatmulT::DefaultEpilogue; + using DefaultOutputOp = typename MatmulT::DefaultOutputOp; + using Mma = typename MatmulT::Mma; + int thread_id = 32 * warp_id + lane_id; + DISPATCH_BOOL( + first, kIsFirst, ([&]() { + static constexpr auto ScaleType = + kIsFirst ? 
cutlass::epilogue::thread::ScaleType::Nothing + : cutlass::epilogue::thread::ScaleType::NoBetaScaling; + using EpilogueOutputOp = typename cutlass::epilogue::thread::LinearCombination< + typename DefaultOutputOp::ElementOutput, + DefaultOutputOp::kCount, + typename DefaultOutputOp::ElementAccumulator, + typename DefaultOutputOp::ElementCompute, + ScaleType>; + using Epilogue = typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MatmulT::OutputTileIterator, + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true // IterationsUnroll + >; + EpilogueOutputOp rescale({1, 1}); + Epilogue epilogue(epilogue_smem, thread_id, warp_id, lane_id); + epilogue(rescale, output_it, accum, output_it); + })); + } + + template + static CUTLASS_DEVICE void computeDelta(Params const& p, + int32_t query_start, + uint8_t warp_id, + uint8_t lane_id) + { + // Each thread computes one value for Delta + // Depending on warp configuration, we might have multiple + // threads of the same warp working on the same row + using AccessType = cutlass::Array; + static_assert(kNumThreads >= kBlockSizeI, ""); + static constexpr int kNumThreadsPerLine = kNumThreads / kBlockSizeI; + int16_t thread_id = 32 * warp_id + lane_id; + + int16_t laneFirstCol = kElementsPerAccess * (lane_id % kNumThreadsPerLine); + int16_t laneRow = thread_id / kNumThreadsPerLine; + bool rowPred = (query_start + laneRow) < p.num_queries; + bool pred = rowPred; + + // on windows, previous syntax __restrict__ AccessType* + // resulted in error: "restrict" is not allowed + const AccessType* __restrict__ grad_output_ptr = reinterpret_cast( + p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM + laneFirstCol); + const AccessType* __restrict__ output_ptr = reinterpret_cast( + p.output_ptr + (query_start + laneRow) * p.o_strideM() + laneFirstCol); + + static constexpr int64_t kMaxIters = kMaxK / (kElementsPerAccess * kNumThreadsPerLine); + constexpr int kPipelineStages = 2; + accum_t delta_value = accum_t(0); + using GlobalLoad = cutlass::arch::global_load; + AccessType frag_grad_output[kPipelineStages]; + AccessType frag_output[kPipelineStages]; + + auto loadAndIncrement = [&](int ld_pos, bool is_valid) { + frag_grad_output[ld_pos].clear(); + frag_output[ld_pos].clear(); + GlobalLoad(frag_grad_output[ld_pos], grad_output_ptr, is_valid); + GlobalLoad(frag_output[ld_pos], output_ptr, is_valid); + grad_output_ptr += kNumThreadsPerLine; + output_ptr += kNumThreadsPerLine; + }; + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kPipelineStages - 1; ++iter) { + int ld_pos = iter % kPipelineStages; + pred = pred && (laneFirstCol + iter * kElementsPerAccess * kNumThreadsPerLine) < + p.head_dim_value; + loadAndIncrement(ld_pos, pred); + } + auto columnIteration = [&](int iter) { + // Load for next iter + int ld_pos = (iter + kPipelineStages - 1) % kPipelineStages; + pred = pred && (laneFirstCol + (iter + kPipelineStages - 1) * kElementsPerAccess * + kNumThreadsPerLine) < p.head_dim_value; + loadAndIncrement(ld_pos, pred); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < AccessType::kElements; ++i) { + delta_value += accum_t(frag_output[iter % kPipelineStages][i]) * + accum_t(frag_grad_output[iter % kPipelineStages][i]); + } + }; + + // If we have a small 
lower-bound for K, we can unroll the loop + if (kMaxK <= 256) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kMaxIters; ++iter) { columnIteration(iter); } + } else { + int num_iters = ceil_div(p.head_dim_value, kElementsPerAccess * kNumThreadsPerLine) * + (kElementsPerAccess * kNumThreadsPerLine); + for (int iter = 0; iter < num_iters; ++iter) { columnIteration(iter); } + } + + // Reduce between workers + static_assert(kNumThreadsPerLine == 1 || kNumThreadsPerLine == 2 || kNumThreadsPerLine == 4, + ""); + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kNumThreadsPerLine; i *= 2) { + delta_value = delta_value + __shfl_xor_sync(0xffffffff, delta_value, i); + } + + // Store in gmem + if (rowPred) { p.delta_ptr[query_start + laneRow] = delta_value; } + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched_impl(typename AK::Params p) +{ + if (!p.advance_to_block()) { return; } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched(typename AK::Params params); diff --git a/csrc/deepspeed4science/evoformer_attn/kernel_forward.h b/csrc/deepspeed4science/evoformer_attn/kernel_forward.h new file mode 100644 index 000000000000..e3b11ebcc661 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/kernel_forward.h @@ -0,0 +1,986 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holdvr nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include +#include + +#include "cutlass/bfloat16.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/vector.h" +#include "cutlass/matrix.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "epilogue/epilogue_pipelined.h" +#include "epilogue/epilogue_rescale_output.h" +#include "gemm/find_default_mma.h" +#include "gemm/mma_from_smem.h" +#include "gemm_kernel_utils.h" +#include "transform/bias_broadcast.h" +#include "transform/tile_smem_loader.h" + +#include + +using namespace gemm_kernel_utils; + +namespace { +template +constexpr int getWarpsPerSm() +{ + return (Arch::kMinComputeCapability >= 80 && !cutlass::platform::is_same::value + ? 16 + : 12); +} +static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) +{ + // source: https://stackoverflow.com/a/51549250 + return (value >= 0) ? 
__int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); +} +} // namespace + +template < + // The datatype of Q/K/V + typename scalar_t_, + // Architecture we are targeting (eg `cutlass::arch::Sm80`) + typename ArchTag, + // If Q/K/V are correctly aligned in memory and we can run a fast kernel + bool isAligned_, + int kQueriesPerBlock, + int kKeysPerBlock_, + bool kSingleValueIteration_, // = `value.shape[-1] <= kKeysPerBlock` + // This is quite slower on V100 for some reason + // Set to false if you know at compile-time you will never need dropout + bool kSupportsBias_ = false, + template class Broadcast1_ = BroadcastNoLoad, + template class Broadcast2_ = BroadcastNoLoad> +struct AttentionKernel { + using scalar_t = scalar_t_; + using accum_t = float; + using lse_scalar_t = float; + using output_t = scalar_t; + // Accumulator between 2 iterations + // Using `accum_t` improves perf on f16 at the cost of + // numerical errors + using output_accum_t = accum_t; + static constexpr bool kSupportsBias = kSupportsBias_; + static constexpr int kKeysPerBlock = kKeysPerBlock_; + static constexpr bool kIsAligned = isAligned_; + static constexpr bool kSingleValueIteration = kSingleValueIteration_; + static constexpr int32_t kAlignLSE = 32; // block size of backward + static constexpr bool kPreloadV = + ArchTag::kMinComputeCapability >= 80 && cutlass::sizeof_bits::value == 16; + static constexpr bool kKeepOutputInRF = kSingleValueIteration; + static constexpr bool kNeedsOutputAccumulatorBuffer = + !kKeepOutputInRF && !cutlass::platform::is_same::value; + + static_assert(kQueriesPerBlock % 32 == 0, ""); + static_assert(kKeysPerBlock % 32 == 0, ""); + static constexpr int kNumWarpsPerBlock = kQueriesPerBlock * kKeysPerBlock / (32 * 32); + static constexpr int kWarpSize = 32; + + // Launch bounds + static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int kMinBlocksPerSm = getWarpsPerSm() / kNumWarpsPerBlock; + + struct Params { + // Input tensors + scalar_t* query_ptr; // [num_queries, num_heads, head_dim] + scalar_t* key_ptr; // [num_keys, num_heads, head_dim] + scalar_t* value_ptr; // [num_keys, num_heads, head_dim_value] + + // Output tensors + output_t* output_ptr; // [num_queries, num_heads, head_dim_value] + output_accum_t* output_accum_ptr; // [num_queries, num_heads, head_dim_value] + lse_scalar_t* logsumexp_ptr; // [num_heads, num_queries] - can be null + + // Scale + accum_t scale; + + // Dimensions/strides + int32_t head_dim; + int32_t head_dim_value; + int32_t num_queries; + int32_t num_keys; + + int32_t q_strideM; + int32_t k_strideM; + int32_t v_strideM; + // int32_t bias_strideM = 0; + + int32_t o_strideM = 0; + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int32_t q_strideH; + int32_t k_strideH; + int32_t v_strideH; + // int32_t bias_strideH = 0; + + int64_t q_strideB; + int64_t k_strideB; + int64_t v_strideB; + // int32_t bias_strideB = 0; + + int32_t num_batches; + int32_t num_heads; + + // Parameters for biases + scalar_t* bias1_ptr = nullptr; + scalar_t* bias2_ptr = nullptr; + int32_t B = 0; + int32_t N = 0; + + // Moves pointers to what we should process + // Returns "false" if there is no work to do + CUTLASS_DEVICE bool advance_to_block() + { + auto batch_id = blockIdx.z; + auto head_id = blockIdx.y; + auto query_start = blockIdx.x * kQueriesPerBlock; + + auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE; + 
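// [Editor's note] Illustrative sketch only, not part of the original diff: the
// logsumexp buffer is effectively indexed as [batch, head, lse_dim], where lse_dim
// rounds num_queries up to a multiple of kAlignLSE (32, noted in the code as the
// backward block size), presumably so each (batch, head) row starts at an aligned
// offset. The logsumexp pointer advance a few lines below is equivalent to this
// standalone helper (hypothetical name):
auto lse_offset = [](int64_t batch_id, int64_t head_id, int64_t query_start,
                     int64_t num_queries, int64_t num_heads) -> int64_t {
    const int64_t kAlignLSE = 32;  // same constant as above
    const int64_t lse_dim = (num_queries + kAlignLSE - 1) / kAlignLSE * kAlignLSE;
    return batch_id * num_heads * lse_dim + head_id * lse_dim + query_start;
};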
+ query_ptr += batch_id * q_strideB; + key_ptr += batch_id * k_strideB; + value_ptr += batch_id * v_strideB; + output_ptr += int64_t(batch_id * num_queries) * o_strideM; + if (output_accum_ptr != nullptr) { + output_accum_ptr += int64_t(batch_id * num_queries) * (head_dim_value * num_heads); + } + + int64_t q_start = 0, k_start = 0; + // Advance to the current batch / head / query_start + query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH; + key_ptr += k_start * k_strideM + head_id * k_strideH; + + value_ptr += k_start * v_strideM + head_id * v_strideH; + output_ptr += int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value; + + if (output_accum_ptr != nullptr) { + output_accum_ptr += int64_t(q_start + query_start) * (head_dim_value * num_heads) + + head_id * head_dim_value; + } else { + // Accumulate directly in the destination buffer (eg for f32) + output_accum_ptr = (accum_t*)output_ptr; + } + + if (logsumexp_ptr != nullptr) { + // lse[batch_id, head_id, query_start] + logsumexp_ptr += batch_id * lse_dim * num_heads + head_id * lse_dim + query_start; + } + + using broadcast_1 = Broadcast1_; + if (kSupportsBias && broadcast_1::kEnable && bias1_ptr) { + bias1_ptr = broadcast_1::advance(bias1_ptr, + batch_id / N, + batch_id % N, + head_id, + num_queries * N, + num_queries, + 0); + } + using broadcast_2 = Broadcast2_; + if (kSupportsBias && broadcast_2::kEnable && bias2_ptr) { + auto strideB = num_heads * num_queries * num_keys; + auto strideH = num_queries * num_keys; + bias2_ptr = broadcast_2::advance( + bias2_ptr, batch_id / N, batch_id % N, head_id, strideB, 0, strideH); + } + + num_queries -= query_start; + num_batches = 0; // no longer used after + + // If num_queries == 1, and there is only one key head we're wasting + // 15/16th of tensor core compute In that case : + // - we only launch kernels for head_id % kQueriesPerBlock == 0 + // - we iterate over heads instead of queries (strideM = strideH) + if (num_queries == 1 && k_strideH == 0 && v_strideH == 0) { + if (head_id % kQueriesPerBlock != 0) return false; + q_strideM = q_strideH; + num_queries = num_heads; + num_heads = 1; // unused but here for intent + o_strideM = head_dim_value; + } + + // Make sure the compiler knows these variables are the same on all + // the threads of the warp. + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + output_ptr = warp_uniform(output_ptr); + output_accum_ptr = warp_uniform(output_accum_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + num_heads = warp_uniform(num_heads); + head_dim = warp_uniform(head_dim); + head_dim_value = warp_uniform(head_dim_value); + o_strideM = warp_uniform(o_strideM); + if (kSupportsBias && broadcast_1::kEnable) { bias1_ptr = warp_uniform(bias1_ptr); } + if (kSupportsBias && broadcast_2::kEnable) { bias2_ptr = warp_uniform(bias2_ptr); } + return true; + } + + __host__ dim3 getBlocksGrid() const + { + return dim3(ceil_div(num_queries, (int32_t)kQueriesPerBlock), num_heads, num_batches); + } + + __host__ dim3 getThreadsGrid() const { return dim3(kWarpSize, kNumWarpsPerBlock, 1); } + }; + + struct MM0 { + /* + In this first matmul, we compute a block of `Q @ K.T`. 
+ While the calculation result is still hot in registers, we update + `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value + into a shared-memory ("AccumulatorSharedStorage") that is used later as + operand A for the second matmul (see MM1) + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration; + static constexpr int kAlignmentA = kIsAligned ? DefaultConfig::kAlignmentA + : GemmType::kMinimumAlignment; + static constexpr int kAlignmentB = kIsAligned ? DefaultConfig::kAlignmentB + : GemmType::kMinimumAlignment; + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::ColumnMajor, // LayoutB, + kAlignmentB, + accum_t, + cutlass::layout::RowMajor, // LayoutC, + OpClass, + ArchTag, // ArchTag + ThreadblockShape, // ThreadblockShape + WarpShape, // WarpShape + typename GemmType::InstructionShape, // InstructionShape + DefaultConfig::kStages, // Should use `DefaultConfig::kStages`, but that + // uses too much smem + typename GemmType::Operator // Operator + >::DefaultMma; + using MmaCore = typename DefaultMma::MmaCore; + using IteratorA = typename DefaultMma::IteratorA; + using IteratorB = typename DefaultMma::IteratorB; + using Mma = typename DefaultMma::ThreadblockMma; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator::Iterator; + static_assert(MmaCore::WarpCount::kM * MmaCore::WarpCount::kN * MmaCore::WarpCount::kK == + kNumWarpsPerBlock, + ""); + + // used for efficient load of bias tile Bij from global to shared memory + using BiasLoader = + TileSmemLoader, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = + typename cutlass::gemm::threadblock::B2bGemm; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MM1 { + /** + Second matmul: perform `attn @ V` where `attn` is the attention (not + normalized) and stored in shared memory + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration; + static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem + static constexpr int kAlignmentB = kIsAligned ? 
DefaultConfig::kAlignmentB + : GemmType::kMinimumAlignment; + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using LayoutB = cutlass::layout::RowMajor; + using DefaultGemm = + cutlass::gemm::kernel::DefaultGemm; + + using DefaultMmaFromSmem = typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + typename MM0::AccumulatorSharedStorage, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + static_assert(WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock, ""); + + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_t>; + using OutputTileIteratorAccum = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_accum_t>; + + struct SharedStorageMM1 { + typename Mma::SharedStorage mm; + }; + }; + + static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; + static constexpr int64_t kAlignmentK = MM0::kAlignmentB; + static constexpr int64_t kAlignmentV = 1; + + // Shared storage - depends on kernel params + struct ScalingCoefs { + cutlass::Array m_prime; + cutlass::Array s_prime; + cutlass::Array mi; + }; + + struct SharedStorageEpilogueAtEnd : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + // typename MM0::BiasLoader::SmemTile bias; + cutlass::AlignedBuffer bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::SharedStorageMM1 mm1; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& epilogue_shared_storage() + { + return epilogue; + } + }; + + struct SharedStorageEpilogueInLoop : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + // typename MM0::BiasLoader::SmemTile bias; + cutlass::AlignedBuffer bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::SharedStorageMM1 mm1; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& epilogue_shared_storage() + { + return after_mm0.epilogue; + } + }; + + using SharedStorage = + typename cutlass::platform::conditional::type; + + static bool __host__ check_supported(Params const& p) + { + CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ); + CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK); + CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV); + EVOFORMER_CHECK(p.q_strideM % kAlignmentQ == 0, "query is not correctly aligned (strideM)"); + EVOFORMER_CHECK(p.k_strideM % kAlignmentK == 0, "key is not correctly aligned (strideM)"); + EVOFORMER_CHECK(p.v_strideM % kAlignmentV == 0, "value is not correctly aligned (strideM)"); + EVOFORMER_CHECK(p.num_heads <= 1 || p.q_strideH % kAlignmentQ == 0, + "query is not correctly aligned (strideH)"); + EVOFORMER_CHECK(p.num_heads <= 1 || p.k_strideH % kAlignmentK == 0, + "key is not correctly aligned 
(strideH)"); + EVOFORMER_CHECK(p.num_heads <= 1 || p.v_strideH % kAlignmentV == 0, + "value is not correctly aligned (strideH)"); + return true; + } + + static void CUTLASS_DEVICE attention_kernel(Params& p) + { + // In this block, we will only ever: + // - read query[query_start:query_end, :] + // - write to output[query_start:query_end, :] + + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + auto& m_prime = shared_storage.m_prime; + auto& s_prime = shared_storage.s_prime; + auto& mi = shared_storage.mi; + const uint32_t query_start = blockIdx.x * kQueriesPerBlock; + + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (thread_id() < kQueriesPerBlock) { + s_prime[thread_id()] = accum_t(0); + m_prime[thread_id()] = -cutlass::platform::numeric_limits::infinity(); + mi[thread_id()] = -cutlass::platform::numeric_limits::infinity(); + } + typename MM1::Mma::FragmentC accum_o; + accum_o.clear(); + + auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { + using OutputTileIterator = typename MM1::OutputTileIterator; + return OutputTileIterator( + typename OutputTileIterator::Params{(int32_t)p.o_strideM}, + p.output_ptr, + typename OutputTileIterator::TensorCoord{p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + auto createOutputAccumIter = [&](int col) -> typename MM1::OutputTileIteratorAccum { + using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum; + return OutputTileIteratorAccum( + typename OutputTileIteratorAccum::Params{(int32_t)(p.head_dim_value * p.num_heads)}, + p.output_accum_ptr, + typename OutputTileIteratorAccum::TensorCoord{p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + // Iterate through keys + for (int32_t iter_key_start = 0; iter_key_start < p.num_keys; + iter_key_start += kKeysPerBlock) { + int32_t problem_size_0_m = cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries); + int32_t problem_size_0_n = + cutlass::fast_min(int32_t(kKeysPerBlock), p.num_keys - iter_key_start); + int32_t const& problem_size_0_k = p.head_dim; + int32_t const& problem_size_1_n = p.head_dim_value; + int32_t const& problem_size_1_k = problem_size_0_n; + + auto prologueV = [&](int blockN) { + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, + p.value_ptr + iter_key_start * p.v_strideM, + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + MM1::Mma::prologue( + shared_storage.after_mm0.mm1.mm, iterator_V, thread_id(), problem_size_1_k); + }; + + __syncthreads(); // Need to have shared memory initialized, and `m_prime` + // updated from end of prev iter + // + // MATMUL: Q.K_t + // + // Computes the block-matrix product of: + // (a) query[query_start:query_end, :] + // with + // (b) key[iter_key_start:iter_key_start + kKeysPerBlock] + // and stores that into `shared_storage.si` + // + + // Compute threadblock location + cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0}; + + cutlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * MM0::Mma::Shape::kM, + tb_tile_offset.k()}; + + cutlass::MatrixCoord tb_offset_B{tb_tile_offset.k(), + tb_tile_offset.n() * MM0::Mma::Shape::kN}; + + // Construct iterators to A and B operands + typename MM0::IteratorA iterator_A( + typename MM0::IteratorA::Params(typename MM0::MmaCore::LayoutA(p.q_strideM)), + p.query_ptr, + {problem_size_0_m, problem_size_0_k}, + thread_id(), + tb_offset_A); + + typename 
MM0::IteratorB iterator_B( + typename MM0::IteratorB::Params(typename MM0::MmaCore::LayoutB(p.k_strideM)), + p.key_ptr + iter_key_start * p.k_strideM, + {problem_size_0_k, problem_size_0_n}, + thread_id(), + tb_offset_B); + + auto my_warp_id = warp_id(); + auto my_lane_id = lane_id(); + + // Construct thread-scoped matrix multiply + typename MM0::Mma mma(shared_storage.mm0, thread_id(), my_warp_id, my_lane_id); + + typename MM0::Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + + if (kPreloadV) { + prologueV(0); + } else { + MM1::Mma::drain_cp_asyncs(); + } + + typename MM0::Mma::Operator::IteratorC::TensorCoord iteratorC_tile_offset = { + (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) + + (my_warp_id % MM0::Mma::WarpCount::kM), + (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) + + (my_warp_id / MM0::Mma::WarpCount::kM)}; + + // multiply by scaling factor + // if (kSupportsBias) { + // accum = + // cutlass::multiplies()(p.scale, + // accum); + // } + + if (kSupportsBias) { + cutlass::TensorRef bias_tensor_ref( + shared_storage.after_mm0.bias.data(), + cutlass::layout::RowMajor(MM0::ThreadblockShape::kN)); + using Shape = + cutlass::MatrixShape; + AttentionBiasEpilogue + bias_epilogue; + bias_epilogue(bias_tensor_ref, + p.bias1_ptr + iter_key_start, + p.bias2_ptr + query_start * p.num_keys + iter_key_start, + thread_id(), + {problem_size_0_m, problem_size_0_n}, + p.num_keys); + // Pij += Bij, Pij is in register fragment and Bij is in shared memory + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + lane_id(), warp_id(), iteratorC_tile_offset); + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) { + accum[idx] = + accum[idx] * p.scale + bias_tensor_ref.at({accum_m, accum_n}); + } + }, + [&](int accum_m) {}); + } + + DISPATCH_BOOL(iter_key_start == 0, kIsFirst, ([&] { + DISPATCH_BOOL( + p.num_keys - iter_key_start >= kKeysPerBlock, kFullColumns, ([&] { + // Update `mi` from accum stored in registers + // Also does accum[i] <- exp(accum[i] - mi) + iterative_softmax(accum_o, + accum, + mi, + m_prime, + s_prime, + lane_id(), + thread_id(), + warp_id(), + p.num_keys - iter_key_start, + iteratorC_tile_offset, + kSupportsBias ? 1.0f : p.scale); + })); + })); + + // Output results to shared-memory + int warp_idx_mn_0 = + my_warp_id % (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN); + auto output_tile_coords = + cutlass::MatrixCoord{warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM}; + + MM0::B2bGemm::accumToSmem( + shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords); + + __syncthreads(); + + // + // MATMUL: Attn . V + // Run the matmul `attn @ V` for a block of attn and V. + // `attn` is read from shared memory (in `shared_storage_si`) + // `V` is read from global memory (with iterator_B) + // + + const int64_t nBlockN = + kSingleValueIteration + ? 
1 + : ceil_div((int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN)); + for (int blockN = 0; blockN < nBlockN; ++blockN) { + int gemm_k_iterations = + (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add and store it in accum + // (in registers) + if (!kPreloadV) { + __syncthreads(); // we share shmem between mma and epilogue + } + + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, + p.value_ptr + iter_key_start * p.v_strideM, + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + typename MM1::Mma mma_pv(shared_storage.after_mm0.mm1.mm, + shared_storage.after_mm0.si, + (int)thread_id(), + (int)warp_id(), + (int)lane_id(), + (int)problem_size_1_k); + mma_pv.set_prologue_done(kPreloadV); + if (!kKeepOutputInRF) { accum_o.clear(); } + mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o); + __syncthreads(); + + if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) { + prologueV(blockN + 1); + } + + if (!kKeepOutputInRF) { + MM1::Mma::drain_cp_asyncs(); + DISPATCH_BOOL( + iter_key_start == 0, kIsFirst, ([&] { + DISPATCH_BOOL( + (iter_key_start + kKeysPerBlock) >= p.num_keys, kIsLast, ([&] { + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = typename cutlass::epilogue::thread:: + MemoryEfficientAttentionNormalize< + typename cutlass::platform:: + conditional:: + type, + output_accum_t, + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, + ElementCompute, + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename cutlass::platform::conditional< + kIsLast, + typename MM1::OutputTileIterator, + typename MM1::OutputTileIteratorAccum>::type, + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // Read + // iterator + >; + + int col = blockN * MM1::Mma::Shape::kN; + auto source_iter = createOutputAccumIter(col); + auto dest_iter = + call_conditional:: + apply(createOutputIter, createOutputAccumIter, col); + EpilogueOutputOp rescale(s_prime, m_prime); + Epilogue epilogue(shared_storage.epilogue_shared_storage(), + thread_id(), + warp_id(), + lane_id()); + epilogue(rescale, dest_iter, accum_o, source_iter); + })); + })); + if (!kSingleValueIteration) { __syncthreads(); } + } + } + __syncthreads(); // we modify `m_prime` after + } + + if (kKeepOutputInRF) { + constexpr bool kIsFirst = true; + constexpr bool kIsLast = true; + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize< + output_t, // output + output_accum_t, // source + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, // accum + output_accum_t, // compute + kIsFirst, + kIsLast, + cutlass::Array>; + using 
Epilogue = typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MM1::OutputTileIterator, // destination + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // source tile + >; + auto dest_iter = createOutputIter(0); + EpilogueOutputOp rescale(s_prime, m_prime); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), thread_id(), warp_id(), lane_id()); + MM1::Mma::drain_cp_asyncs(); + epilogue(rescale, dest_iter, accum_o); + } + + // 7. Calculate logsumexp + // To make the backward easier, we pad logsumexp with `inf` + // this avoids a few bound checks, and is not more expensive during fwd + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) { + auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE; + if (thread_id() < p.num_queries) { + p.logsumexp_ptr[thread_id()] = + accum_t(mi[thread_id()]) + cutlass::fast_log(accum_t(s_prime[thread_id()])); + } else if (thread_id() < lse_dim) { + p.logsumexp_ptr[thread_id()] = + cutlass::platform::numeric_limits::infinity(); + } + } + } + + template + CUTLASS_DEVICE static void iterative_softmax( + typename WarpIteratorC::Fragment& frag_o, // output so far + typename WarpIteratorC::Fragment& frag, + cutlass::Array& mi, + cutlass::Array& m_prime, + cutlass::Array& s_prime, + int8_t lane_id, + int8_t thread_id, + int8_t warp_id, + int16_t max_col, + typename WarpIteratorC::TensorCoord const& tile_offset, + float scaling) + { + /* Iterates on the accumulator and corresponding position on result matrix + + (1) Update `mi[r]` to the max value of the row `r` + (2) In a second iteration do the following: + (a) accum <- exp(accum - mi) + (b) m_prime <- exp(m_prime - mi) + (c) s_prime <- s_prime * m_prime + sum(accum) + + All of this is done on registers, before we store all of this + on shared memory for the next matmul with Value. + */ + using Fragment = typename WarpIteratorC::Fragment; + using LambdaIterator = + typename DefaultMmaAccumLambdaIterator::Iterator; + // Convert to `accum_t` (rather than double) + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + if (!kIsFirst) { + if (thread_id < kQueriesPerBlock) { m_prime[thread_id] = mi[thread_id]; } + __syncthreads(); + } + + auto lane_offset = LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset); + + // First update `mi` to the max per-row + { + accum_t max; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { max = -cutlass::platform::numeric_limits::infinity(); }, + [&](int accum_m, int accum_n, int idx) { + if (kFullColumns || accum_n < max_col) { + max = cutlass::fast_max(max, frag[idx]); + } + }, + [&](int accum_m) { + // Having 4x atomicMax seems faster than reduce within warp + // first... 
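// For reference, the streaming-softmax recurrence this function implements per row r,
// with s_rj = scaling * <q_r, k_j> for the current key block, is (a math-level sketch;
// the code below realizes exp(x) as exp2f(x * log2(e)) and works on the warp/lane
// layout of the accumulator fragments):
//   mi[r]      <- max(mi[r], max_j s_rj)                          // running row maximum
//   m_prime[r] <- exp(mi_old[r] - mi[r])                          // correction factor
//   s_prime[r] <- s_prime[r] * m_prime[r] + sum_j exp(s_rj - mi[r])
//   frag_o[r]  <- frag_o[r] * m_prime[r]     // only when the output is kept in registers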
+ atomicMaxFloat(&mi[accum_m], max * scaling); + }); + } + frag = cutlass::multiplies()(scaling * kLog2e, frag); + + // Make sure we all share the update values for `mi` + __syncthreads(); + + if (thread_id < kQueriesPerBlock) { + auto m_prime_exp = exp2f(kLog2e * (m_prime[thread_id] - mi[thread_id])); + m_prime[thread_id] = m_prime_exp; + s_prime[thread_id] *= m_prime_exp; + } + __syncthreads(); // Update output fragments + if (kKeepOutputInRF && !kIsFirst) { + accum_t mp; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { mp = m_prime[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { frag_o[idx] *= mp; }, + [&](int accum_m) {}); + __syncthreads(); + } + // Update accum_m, accum_n, ... + { + accum_t mi_row, total_row; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { mi_row = kLog2e * mi[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag[idx] = (kFullColumns || accum_n < max_col) ? exp2f(frag[idx] - mi_row) + : accum_t(0.0); + }, + [&](int accum_m) {}); + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { total_row = 0.0; }, + [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; }, + [&](int accum_m) { + if (LambdaIterator::reduceSameRow( + lane_id, total_row, [](accum_t a, accum_t b) { return a + b; })) { + atomicAdd(&s_prime[accum_m], total_row); + } + }); + } + } + + static CUTLASS_DEVICE int8_t lane_id() { return threadIdx.x; } + static CUTLASS_DEVICE int8_t warp_id() { return threadIdx.y; } + static CUTLASS_DEVICE int16_t thread_id() { return threadIdx.x + threadIdx.y * blockDim.x; } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched_impl(typename AK::Params p) +{ + if (!p.advance_to_block()) { return; } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched(typename AK::Params params); diff --git a/csrc/deepspeed4science/evoformer_attn/transform/bias_broadcast.h b/csrc/deepspeed4science/evoformer_attn/transform/bias_broadcast.h new file mode 100644 index 000000000000..0f15a43574cf --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/transform/bias_broadcast.h @@ -0,0 +1,148 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +// This does nothing. +template +struct BroadcastNoLoad { + using Fragment = + cutlass::Array; + static const bool kEnable = false; + CUTLASS_DEVICE static void load(Fragment& frag, + scalar_t* ptr, + int thread_id, + const cutlass::MatrixCoord& extent, + int stride) + { + } + CUTLASS_DEVICE static scalar_t* + advance(scalar_t* ptr, int B_id, int N_id, int H_id, int strideB, int strideN, int strideH) + { + return ptr; + } +}; + +// This is to load the bias matrix from the global memory with on-the-fly +// broadcast. The shape in global memory is [B, N, 1, 1, L]. 
Each time we load +// the last dimension as a L row vector, and we further broadcast the L vector +// to a tile of size [L, L] by repeating the L vector L times +template +struct BroadcastA : public BroadcastNoLoad { + using Base = BroadcastNoLoad; + static const bool kEnable = true; + using layout = cutlass::layout::AffineRank2RowMajor; + + using GmemTileIterator = cutlass::transform::threadblock:: + PredicatedTileIterator; + using Fragment = typename GmemTileIterator::Fragment; + + CUTLASS_DEVICE static void load(Fragment& frag, + scalar_t* ptr, + int thread_id, + const cutlass::MatrixCoord& extent, + int stride) + { + GmemTileIterator iter({layout(0, 1)}, ptr, extent, thread_id); + iter.load(frag); + } + + CUTLASS_DEVICE static scalar_t* + advance(scalar_t* ptr, int B_id, int N_id, int H_id, int strideB, int strideN, int strideH) + { + return ptr + B_id * strideB + N_id * strideN; + } +}; + +// This is to load the bias matrix from the global memory with on-the-fly +// broadcast. The shape in global memory is [B, 1, H, L, L]. Each time we load +// a [L, L] matrix. Different N use the same bias matrix when B and H are the +// same. +template +struct BroadcastB : public BroadcastNoLoad { + using Base = BroadcastNoLoad; + static const bool kEnable = true; + using layout = cutlass::layout::RowMajor; + + using GmemTileIterator = cutlass::transform::threadblock:: + PredicatedTileIterator; + using Fragment = typename GmemTileIterator::Fragment; + + CUTLASS_DEVICE static void load(Fragment& frag, + scalar_t* ptr, + int thread_id, + const cutlass::MatrixCoord& extent, + int stride) + { + GmemTileIterator iter({layout(stride)}, ptr, extent, thread_id); + iter.load(frag); + } + + CUTLASS_DEVICE static scalar_t* + advance(scalar_t* ptr, int B_id, int N_id, int H_id, int strideB, int strideN, int strideH) + { + return ptr + B_id * strideB + H_id * strideH; + } +}; + +template + class Broadcast1_, + template + class Broadcast2_> +struct AttentionBiasEpilogue { + using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + 1>; + + using Broadcast1 = Broadcast1_; + using Broadcast2 = Broadcast2_; + + Broadcast1 broadcast1; + Broadcast2 broadcast2; + + using Ref = cutlass::TensorRef; + using SmemTileIterator = cutlass::transform::threadblock:: + RegularTileIterator; + + CUTLASS_DEVICE void operator()(const Ref& ref, + scalar_t* ptr1, + scalar_t* ptr2, + int thread_id, + const cutlass::MatrixCoord& extent, + int stride) + { + static_assert(Broadcast1::Fragment::kElements == Broadcast2::Fragment::kElements, + "The two broadcast fragments must have the same number of " + "elements"); + typename SmemTileIterator::Fragment frag; + frag.clear(); + float* frag_ptr = reinterpret_cast(&frag); + if (Broadcast1::kEnable) { + typename Broadcast1::Fragment frag1; + frag1.clear(); + broadcast1.load(frag1, ptr1, thread_id, extent, stride); + scalar_t* frag1_ptr = reinterpret_cast(&frag1); + for (int i = 0; i < Broadcast1::Fragment::kElements; ++i) { + frag_ptr[i] += static_cast(frag1_ptr[i]); + } + } + if (Broadcast2::kEnable) { + typename Broadcast2::Fragment frag2; + frag2.clear(); + broadcast2.load(frag2, ptr2, thread_id, extent, stride); + scalar_t* frag2_ptr = reinterpret_cast(&frag2); + for (int i = 0; i < Broadcast2::Fragment::kElements; ++i) { + frag_ptr[i] += static_cast(frag2_ptr[i]); + } + } + SmemTileIterator iter(ref, thread_id); + iter.store(frag); + __syncthreads(); + } +}; diff --git 
a/csrc/deepspeed4science/evoformer_attn/transform/tile_smem_loader.h b/csrc/deepspeed4science/evoformer_attn/transform/tile_smem_loader.h new file mode 100644 index 000000000000..5f3f0ee507e5 --- /dev/null +++ b/csrc/deepspeed4science/evoformer_attn/transform/tile_smem_loader.h @@ -0,0 +1,93 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once +#include +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/numeric_types.h" +#include "cutlass/platform/platform.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/regular_tile_iterator.h" + +template // thread access width in elements +class TileSmemLoader { +public: + using Shape = ThreadblockTileShape; + using SmemTile = cutlass::AlignedBuffer; + + using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, // strided + Threads, // Threads + ElementsPerAccess>; // ElementsPerAccess + + using GmemTileIterator = cutlass::transform::threadblock::PredicatedTileIterator< + ThreadblockTileShape, // Shape + scalar_t, // Element + cutlass::layout::RowMajor, // Layout + 0, // AdvanceRank + ThreadMap>; // ThreadMap + + using SmemTileIterator = + cutlass::transform::threadblock::RegularTileIterator; // ThreadMap + + using Fragment = typename GmemTileIterator::Fragment; + + /// load a tile from global memory into shared memory + CUTLASS_DEVICE + static void load(GmemTileIterator tile_load_iter, SmemTileIterator tile_store_iter) + { + Fragment tb_frag; + tb_frag.clear(); + tile_load_iter.load(tb_frag); + tile_store_iter.store(tb_frag); + + __syncthreads(); + } +}; diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu index 751a14440ae8..7a8e7ca446b0 100644 --- a/csrc/transformer/inference/csrc/dequantize.cu +++ b/csrc/transformer/inference/csrc/dequantize.cu @@ -130,16 +130,16 @@ void launch_dequantize(T* output, { unsigned threads = 1024; hidden_dim /= 4; - unsigned hid_cnt = threads / hidden_dim; unsigned thd_cnt = (hidden_dim - 1) / threads + 1; - hid_cnt = hid_cnt > 0 ? hid_cnt : 1; - unsigned blocks = (output_size + hid_cnt * groups - 1) / (hid_cnt * groups); + assert(output_size % groups == 0); + unsigned blocks = output_size / groups; + dim3 block_dims(threads); dim3 grid_dims(groups, blocks); dequantize_kernel<<>>( - output, input, qscale, hidden_dim, hid_cnt * hidden_dim, thd_cnt); + output, input, qscale, hidden_dim, hidden_dim, thd_cnt); } #define INSTANTIATE_DEQUANTIZE_NO_MERGE(T) \ diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 97db77bff8a2..634b6e3adbbb 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -437,6 +437,7 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, bool rotate_half, bool rotate_every_two, int heads, + int num_kv, float norm_factor, bool triangular, bool local_attention, @@ -448,14 +449,14 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, { unsigned bsz = query_key_value.size(0); unsigned seq_len = query_key_value.size(1); - unsigned hidden_dim = query_key_value.size(2) / 3; + int k = query_key_value.size(2) / (heads + 2 * (num_kv > 0 ? 
num_kv : heads)); + unsigned hidden_dim = heads * k; bool is_prompt = (seq_len > 1); if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len); unsigned soft_len = InferenceContext::Instance().current_tokens(); - int k = hidden_dim / heads; auto options = at::TensorOptions() .dtype(query_key_value.options().dtype()) .layout(at::kStrided) @@ -486,6 +487,7 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, soft_len, hidden_dim, heads, + (num_kv > 0 ? num_kv : heads), rotary_dim, rotate_half, rotate_every_two, @@ -1167,6 +1169,7 @@ at::Tensor ds_linear_layer(at::Tensor& input, (num_heads * padded_head_size), num_heads, -1, + -1, false, false, InferenceContext::Instance().GetCurrentStream(), @@ -1192,6 +1195,7 @@ at::Tensor ds_linear_layer(at::Tensor& input, input_cont.size(2), num_heads, -1, + -1, false, false, InferenceContext::Instance().GetCurrentStream(), @@ -1906,6 +1910,27 @@ void ds_release_workspace() { InferenceContext::Instance().release_workspace(); bool ds_retake_workspace() { return InferenceContext::Instance().retake_workspace(); } +template +at::Tensor ds_dequantize(at::Tensor& weight, at::Tensor& qscale, int groups) +{ + auto options = at::TensorOptions() + .dtype(torch::kFloat16) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + auto weight16 = at::empty({weight.size(0), weight.size(1)}, options); + + launch_dequantize((T*)weight16.data_ptr(), + (int8_t*)weight.data_ptr(), + (float*)qscale.data_ptr(), + weight.size(0), + weight.size(1), + groups, + InferenceContext::Instance().GetCurrentStream()); + + return weight16; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("softmax_context_int8", @@ -1973,7 +1998,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed residual add with " #_name " (CUDA)"); \ m.def("allocate_workspace_" #_name, \ &allocate_workspace<_dtype>, \ - "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)") + "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)"); \ + m.def("dequantize_" #_name, \ + &ds_dequantize<_dtype>, \ + "DeepSpeed dequantize with " #_name " (CUDA)") DEF_OPS(fp32, float); DEF_OPS(fp16, __half); diff --git a/csrc/transformer/inference/csrc/transform.cu b/csrc/transformer/inference/csrc/transform.cu index 819afa4cd26a..0b8bffa643c6 100644 --- a/csrc/transformer/inference/csrc/transform.cu +++ b/csrc/transformer/inference/csrc/transform.cu @@ -26,6 +26,8 @@ __global__ void bias_add_transform_0213(float* output, int seq_length, unsigned seq_offset, int heads, + int head_stride, + int num_kv, int rotary_dim, bool rotate_half, bool rotate_every_two, @@ -49,10 +51,10 @@ __global__ void bias_add_transform_0213(float* output, float4* output_vec = reinterpret_cast(cnt == 0 ? output : (cnt == 1 ? k_cache : v_cache)); - vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); - vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); - vals_vec += (cnt * d1_stride); - vals_vec += (d2 * d2_stride); + vals_vec += (d0 * (d1_stride + num_kv * 2 * d2_stride) * seq_length); + vals_vec += d1 * (d1_stride + num_kv * 2 * d2_stride); + vals_vec += (cnt == 0 ? 0 : d1_stride) + (cnt == 0 ? 0 : (cnt - 1) * num_kv * d2_stride); + vals_vec += ((cnt == 0 ? 
d2 : (d2 / head_stride)) * d2_stride); output_vec += (d1 * d2_stride); output_vec += (d0 * d0_out_stride); @@ -92,6 +94,8 @@ __global__ void bias_add_transform_0213(T* output, // q unsigned seq_offset, int all_tokens, int heads, + int head_stride, + int num_kv, int rotary_dim, bool rotate_half, bool rotate_every_two, @@ -124,10 +128,10 @@ __global__ void bias_add_transform_0213(T* output, // q float4* output_vec = reinterpret_cast(cnt == 0 ? output : (cnt == 1 ? k_cache : v_cache)); - vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); - vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); - vals_vec += (cnt * d1_stride); - vals_vec += (d2 * d2_stride); + vals_vec += (d0 * (d1_stride + num_kv * 2 * d2_stride) * seq_length); + vals_vec += (d1 * (d1_stride + num_kv * 2 * d2_stride)); + vals_vec += (cnt == 0 ? 0 : d1_stride) + (cnt == 0 ? 0 : (cnt - 1) * num_kv * d2_stride); + vals_vec += ((cnt == 0 ? d2 : (d2 / head_stride)) * d2_stride); output_vec += (d1 * d2_stride); output_vec += (d0 * d0_out_stride); @@ -171,6 +175,7 @@ void launch_bias_add_transform_0213(float* output, int all_tokens, int hidden_dim, int heads, + int num_kv, int rotary_dim, bool rotate_half, bool rotate_every_two, @@ -193,6 +198,8 @@ void launch_bias_add_transform_0213(float* output, seq_length, seq_offset, heads, + num_kv > 0 ? (heads / num_kv) : 1, + num_kv > 0 ? num_kv : heads, rotary_dim >> 2, rotate_half, rotate_every_two, @@ -212,6 +219,7 @@ void launch_bias_add_transform_0213(T* output, int all_tokens, int hidden_dim, int heads, + int num_kv, int rotary_dim, bool rotate_half, bool rotate_every_two, @@ -233,6 +241,8 @@ void launch_bias_add_transform_0213(T* output, seq_offset, all_tokens, heads, + num_kv > 0 ? (heads / num_kv) : 1, + num_kv > 0 ? num_kv : heads, rotary_dim >> 3, rotate_half, rotate_every_two, @@ -253,6 +263,7 @@ void launch_bias_add_transform_0213(T* output, int, \ int, \ int, \ + int, \ bool, \ bool, \ cudaStream_t, \ diff --git a/csrc/transformer/inference/includes/inference_cuda_layers.h b/csrc/transformer/inference/includes/inference_cuda_layers.h index 8ba8c1c3e22c..5240ebb1d524 100644 --- a/csrc/transformer/inference/includes/inference_cuda_layers.h +++ b/csrc/transformer/inference/includes/inference_cuda_layers.h @@ -201,6 +201,7 @@ void launch_bias_add_transform_0213(T* outputs, int seq_length1, int hidden_dim, int heads, + int num_kv, int rotary_dim, bool rotate_half, bool rotate_every_two, diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 5d54035a39fc..94c63e13329d 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -598,4 +598,12 @@ def _generate(self, *inputs, **kwargs): raise NotImplementedError("DeepSpeed does not support `num_beams` > 1, if this is important to you please " "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506") + if ("input_ids" in kwargs) and (kwargs["input_ids"].dim() == 2): + for input_tensor in kwargs["input_ids"]: + tensor_length = input_tensor.shape[-1] + if tensor_length > self._config.max_out_tokens: + raise RuntimeError( + f"Input with size {tensor_length} exceeds maximum length of {self._config.max_out_tokens}. Please increase `max_tokens` in the DeepSpeed Inference Config." 
+ ) + return self.module.generate(*inputs, **kwargs) diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 3f4552224f81..5b879efee469 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -66,7 +66,7 @@ def parse_user_args(self): def get_cmd(self, environment, active_resources): environment['PDSH_RCMD_TYPE'] = 'ssh' if self.args.ssh_port is not None: # only specify ssh port if it is specified - environment["PDSH_SSH_ARGS_APPEND"] = f" -p {self.args.ssh_port}" + environment["PDSH_SSH_ARGS_APPEND"] += f" -p {self.args.ssh_port}" active_workers = ",".join(active_resources.keys()) logger.info("Running on the following workers: %s" % active_workers) diff --git a/deepspeed/model_implementations/transformers/ds_llama2.py b/deepspeed/model_implementations/transformers/ds_llama2.py new file mode 100644 index 000000000000..7d9eb4113a8a --- /dev/null +++ b/deepspeed/model_implementations/transformers/ds_llama2.py @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from deepspeed import comm as dist +from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference + +inference_module = None + + +class DeepSpeedLlama2Inference(DeepSpeedTransformerInference): + """Initialize the DeepSpeed OPT Transformer Layer. + """ + + def __init__(self, + config, + mp_group=None, + quantize_scales=None, + quantize_groups=1, + merge_count=1, + mlp_extra_grouping=False): + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) + + def forward(self, *args, **kwargs): + + input = args[0] + input_mask = None + # Allocate memory only on first layer forward + if self.config.layer_id == 0 and self._alloc_workspace: + self.allocate_workspace(self.config.hidden_size, self.config.heads, + input.size()[1], + input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, + self.config.bigscience_bloom, + dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, + self.config.min_out_tokens) + self._alloc_workspace = False + + get_present = True + + # We set the prev key/value to None when there is a prompt + if input.shape[1] > 1: + self.layer_past = None + layer_past = self.layer_past + + input_type = input.dtype + + if (self.config.dtype in [torch.float16, torch.bfloat16, torch.int8]) \ + and input.dtype == torch.float: + target_dtype = torch.half if self.dtype == torch.int8 else self.dtype + input = input.to(target_dtype) + + with torch.no_grad(): + attention_output, key, value, context_outputtn_ctx, inp_norm = \ + self.attention(input, + input_mask, + None, + layer_past, + get_present, + None, None, None, + self.norm_w, + self.norm_b, + None) + self.layer_past = (key, value) + output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) + + output = output.to(input_type) + return output diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 784700d7a702..6efa83179591 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -52,7 +52,11 @@ def strided_copy(self, src_split = torch.split(src.data, src.shape[outer_dim] // num_splits, dim=outer_dim) if (len(src_shape) == 2 and len(dst_shape) == 2): if src_shape[outer_dim] == dst_shape[self.out_dim]: - dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape) + try: + dst = 
dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape) + except: + print(dst.shape, src.shape) + exit() dst = torch.nn.parameter.Parameter(dst, requires_grad=False) if hasattr(src, 'scale'): dst.scale = src.scale diff --git a/deepspeed/module_inject/containers/__init__.py b/deepspeed/module_inject/containers/__init__.py index 1dab38b73f51..993d14071659 100644 --- a/deepspeed/module_inject/containers/__init__.py +++ b/deepspeed/module_inject/containers/__init__.py @@ -11,6 +11,8 @@ from .gptneo import DS_GPTNEOContainer, HFGPTNEOLayerPolicy from .gptneox import DS_GPTNEOXContainer, GPTNEOXLayerPolicy from .llama import DS_LLAMAContainer, LLAMALayerPolicy +from .llama2 import LLAMA2LayerPolicy, DS_LLAMA2Container +from .internlm import DS_InternLMContainer, InternLMLayerPolicy from .megatron_gpt import DS_MegatronGPTContainer, MegatronLayerPolicy from .megatron_gpt_moe import DS_MegatronGPTMoEContainer, MegatronMoELayerPolicy from .opt import DS_OPTContainer, HFOPTLayerPolicy diff --git a/deepspeed/module_inject/containers/base.py b/deepspeed/module_inject/containers/base.py index 4498bfba8f1f..83e109167ffe 100644 --- a/deepspeed/module_inject/containers/base.py +++ b/deepspeed/module_inject/containers/base.py @@ -142,7 +142,7 @@ def initialize_tensors(self, enable_training=False): self.set_attention(*self.policy.attention(enable_training=enable_training)) self.set_mlp(*self.policy.mlp(enable_training=enable_training)) self.set_layernorm(*self.policy.layernorm()) - self.check_meta_tensor_support() + #self.check_meta_tensor_support() def convert_to_required_dtype(self): # Note: converting tensors to fp16 requires that we do it in-place using self.__dict__ and not make a list/dict copy diff --git a/deepspeed/module_inject/containers/internlm.py b/deepspeed/module_inject/containers/internlm.py new file mode 100644 index 000000000000..31255d4b3ca5 --- /dev/null +++ b/deepspeed/module_inject/containers/internlm.py @@ -0,0 +1,181 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import importlib + +import torch +from torch.nn.parameter import Parameter + +from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference +from deepspeed.utils.types import ActivationFuncType, NormType + +from ..policy import (TransformerPolicy, maybe_copy, maybe_copy_geglu, maybe_copy_qkv, maybe_get_lora, + transformer_param_names) +from .base import * +from .features import HybridGatedMLPContainer, HybridSplitQKVContainer + + +class DS_InternLMContainer(HybridGatedMLPContainer, HybridSplitQKVContainer, BaseTransformerContainer): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # All model specific things should be defined here instead of the base class. 
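# For orientation, a minimal usage sketch (model name and arguments are illustrative,
# not taken from this diff): containers such as DS_InternLMContainer are normally
# reached through kernel injection rather than instantiated directly, e.g.
#
#   import torch, deepspeed
#   from transformers import AutoModelForCausalLM
#
#   model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b",
#                                                trust_remote_code=True)
#   engine = deepspeed.init_inference(model, dtype=torch.half,
#                                     replace_with_kernel_inject=True)
#
# InternLMLayerPolicy (defined below in this file) then matches the dynamically
# imported InternLMDecoderLayer class, and this container rebuilds each matched
# layer on top of DeepSpeedGPTInference.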
+ + def create_module(self, config=None): + _config = config if config is not None else self.ds_model_config + + _config.rotate_half = True + _config.rotate_every_two = False + _config.rotary_dim = self.hidden_size // self.num_attention_heads + self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group) + + return self.module + + def set_lora_params(self): + """ + Necessary to implement for `HybridEngineContainer` + """ + self.lora_params = [ + maybe_get_lora(p) for p in [ + self.policy.client_module.mlp.up_proj.weight, self.policy.client_module.mlp.gate_proj.weight, + self.policy.client_module.mlp.down_proj.weight, self.policy.client_module.self_attn.q_proj.weight, + self.policy.client_module.self_attn.k_proj.weight, self.policy.client_module.self_attn.v_proj.weight, + self.policy.client_module.self_attn.o_proj.weight + ] + ] + + def get_lora_matched_pair(self): + up_proj_lora, gate_proj_lora, down_proj_lora, q_lora, k_lora, v_lora, out_lora = self.get_lora_params() + ret = [(up_proj_lora, self.inter_up_w), (gate_proj_lora, self.inter_gate_w), (down_proj_lora, self._4hh_w), + (out_lora, self.dense_w), (q_lora, self.qw), (k_lora, self.kw), (v_lora, self.vw)] + return ret + + def set_q_k_v(self): + """ + Necessary to implement for `HybridSplitQKVContainer` + """ + self.qw = self.policy.client_module.self_attn.q_proj.weight + self.qb = self.policy.client_module.self_attn.q_proj.bias + self.kw = self.policy.client_module.self_attn.k_proj.weight + self.kb = self.policy.client_module.self_attn.k_proj.bias + self.vw = self.policy.client_module.self_attn.v_proj.weight + self.vb = self.policy.client_module.self_attn.v_proj.bias + + def set_mlp_gate(self): + """ + Necessary to implement for `HybridGatedMLPContainer` + """ + self.inter_up_w = self.policy.client_module.mlp.up_proj.weight + self.inter_up_b = None + self.inter_gate_w = self.policy.client_module.mlp.gate_proj.weight + self.inter_gate_b = None + + def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): + param_names = ( + 'self_attn.q_proj.weight', \ + 'self_attn.k_proj.weight', \ + 'self_attn.v_proj.weight', \ + 'self_attn.o_proj.weight', \ + 'mlp.up_proj.weight', \ + 'mlp.gate_proj.weight', \ + 'mlp.down_proj.weight', \ + 'input_layernorm.weight', \ + 'post_attention_layernorm.weight' + 'self_attn.q_proj.bias', \ + 'self_attn.k_proj.bias', \ + 'self_attn.v_proj.bias', \ + 'self_attn.o_proj.bias', \ + ) + + maybe_copy_qkv(module.attention, + sd, + weight_quantizer, + mp_replace, + 'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]], + split_qkv=self.policy.split_qkv) + maybe_copy_qkv(module.attention, + sd, + weight_quantizer, + mp_replace, + 'attn_qkvb', [prefix + param_names[9], prefix + param_names[10], prefix + param_names[11]], + split_qkv=self.policy.split_qkv) + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[2], + prefix + param_names[3]) + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[3], + prefix + param_names[12]) + maybe_copy_geglu(module.mlp, sd, weight_quantizer, mp_replace, 'inter_w', + [prefix + param_names[4], prefix + param_names[5]]) + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, 'output_w', prefix + param_names[6]) + + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[8], prefix + param_names[7]) + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[10], prefix + param_names[8]) + + +class 
InternLMLayerPolicy(TransformerPolicy): + _orig_layer_class = [] + _orig_layer_class_inited = False + + def __init__(self, client_module, inference=True): + super().__init__( + inference, + mlp_act_func_type=ActivationFuncType.GATED_SILU, + norm_type=NormType.RMSNorm, + ) + self.client_module = client_module + + self._init_orig_layer_class_once() + + def _init_orig_layer_class_once(self): + if InternLMLayerPolicy._orig_layer_class_inited: + return + + for sub_pkg in ['', '.internlm-7b', '.internlm-chat-7b']: + try: + from transformers.utils import TRANSFORMERS_DYNAMIC_MODULE_NAME + module = importlib.import_module(f"{TRANSFORMERS_DYNAMIC_MODULE_NAME}{sub_pkg}.modeling_internlm") + if module.InternLMDecoderLayer not in InternLMLayerPolicy._orig_layer_class: + InternLMLayerPolicy._orig_layer_class.append(module.InternLMDecoderLayer) + except ImportError: + continue + + InternLMLayerPolicy._orig_layer_class_inited = True + + def get_hidden_heads(self): + return self.client_module.self_attn.q_proj.weight.shape[1], \ + self.client_module.self_attn.num_heads, \ + self.client_module.input_layernorm.variance_epsilon, \ + self.client_module.mlp.gate_proj.weight.shape[0] + + def attention(self, enable_training=False): + qw = self.client_module.self_attn.q_proj.weight + kw = self.client_module.self_attn.k_proj.weight + vw = self.client_module.self_attn.v_proj.weight + qb = self.client_module.self_attn.q_proj.bias + kb = self.client_module.self_attn.k_proj.bias + vb = self.client_module.self_attn.v_proj.bias + + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) + qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training) + + return qkvw, \ + qkvb, \ + self.client_module.self_attn.o_proj.weight, \ + self.client_module.self_attn.o_proj.bias + + def mlp(self, enable_training=False): + mlp1_up = self.client_module.mlp.up_proj.weight + mlp1_gate = self.client_module.mlp.gate_proj.weight + mlp2 = self.client_module.mlp.down_proj.weight + + mlp1 = Parameter(torch.cat((mlp1_up, mlp1_gate), dim=0), requires_grad=enable_training) + + return mlp1, None, mlp2, None + + def layernorm(self): + return self.client_module.post_attention_layernorm.weight, \ + None, \ + self.client_module.input_layernorm.weight, \ + None diff --git a/deepspeed/module_inject/containers/llama.py b/deepspeed/module_inject/containers/llama.py index aa4dbbec4b8a..af99d658017c 100644 --- a/deepspeed/module_inject/containers/llama.py +++ b/deepspeed/module_inject/containers/llama.py @@ -4,7 +4,7 @@ # DeepSpeed Team from .base import * -from .features import HybridSplitQKVContainer, HybridGatedMLPContainer +from .features import HybridSplitQKVContainer, HybridGatedMLPContainer, MetaTensorContainer from deepspeed.utils.types import ActivationFuncType, NormType from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference import torch @@ -20,7 +20,8 @@ ) -class DS_LLAMAContainer(HybridGatedMLPContainer, HybridSplitQKVContainer, BaseTransformerContainer): +class DS_LLAMAContainer(MetaTensorContainer, HybridGatedMLPContainer, HybridSplitQKVContainer, + BaseTransformerContainer): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -85,8 +86,8 @@ def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): 'mlp.up_proj.weight', \ 'mlp.gate_proj.weight', \ 'mlp.down_proj.weight', \ - 'input_layernorm.weight', \ - 'post_attention_layernorm.weight' + 'post_attention_layernorm.weight', \ + 'input_layernorm.weight', ) maybe_copy_qkv(module.attention, @@ -105,6 
+106,10 @@ def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[8], prefix + param_names[7]) maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[10], prefix + param_names[8]) + # This line is necessary for proper output when kernels + meta tensors are used in Llama models + # TODO: Investigate root-cause and fix meta tensor loading + module.mlp.output_b = None + class LLAMALayerPolicy(TransformerPolicy): diff --git a/deepspeed/module_inject/containers/llama2.py b/deepspeed/module_inject/containers/llama2.py new file mode 100644 index 000000000000..b531890ab859 --- /dev/null +++ b/deepspeed/module_inject/containers/llama2.py @@ -0,0 +1,158 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .base import * +from .features import HybridSplitQKVContainer, HybridGatedMLPContainer, MetaTensorContainer +from deepspeed.utils.types import ActivationFuncType, NormType +from deepspeed.model_implementations.transformers.ds_llama2 import DeepSpeedLlama2Inference +import torch +from torch.nn.parameter import Parameter + +from ..policy import ( + TransformerPolicy, + transformer_param_names, + maybe_copy, + maybe_copy_qkv, + maybe_copy_geglu, + maybe_get_lora, +) + + +class DS_LLAMA2Container(MetaTensorContainer, HybridGatedMLPContainer, HybridSplitQKVContainer, + BaseTransformerContainer): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # All model specific things should be defined here instead of the base class. + + def create_module(self, config=None): + _config = config if config is not None else self.ds_model_config + + _config.rotate_half = False + _config.rotate_every_two = True + _config.rotary_dim = self.hidden_size // self.num_attention_heads + _config.num_kv = self.policy.client_module.attention.n_kv_heads + self.module = DeepSpeedLlama2Inference(_config, mp_group=self.mp_group) + + return self.module + + def set_lora_params(self): + """ + Necessary to implement for `HybridEngineContainer` + """ + self.lora_params = [ + maybe_get_lora(p) for p in [ + self.policy.client_module.feed_forward.w3.weight, self.policy.client_module.feed_forward.w1.weight, + self.policy.client_module.feed_forward.w2.weight, self.policy.client_module.attention.wq.weight, + self.policy.client_module.attention.wk.weight, self.policy.client_module.attention.wv.weight, + self.policy.client_module.attention.wo.weight + ] + ] + + def get_lora_matched_pair(self): + up_proj_lora, gate_proj_lora, down_proj_lora, q_lora, k_lora, v_lora, out_lora = self.get_lora_params() + ret = [(up_proj_lora, self.inter_up_w), (gate_proj_lora, self.inter_gate_w), (down_proj_lora, self._4hh_w), + (out_lora, self.dense_w), (q_lora, self.qw), (k_lora, self.kw), (v_lora, self.vw)] + return ret + + def set_q_k_v(self): + """ + Necessary to implement for `HybridSplitQKVContainer` + """ + self.qw = self.policy.client_module.attention.wq.weight + self.qb = None + self.kw = self.policy.client_module.attention.wk.weight + self.kb = None + self.vw = self.policy.client_module.attention.wv.weight + self.vb = None + + def set_mlp_gate(self): + """ + Necessary to implement for `HybridGatedMLPContainer` + """ + self.inter_up_w = self.policy.client_module.feed_forward.w2.weight + self.inter_up_b = None + self.inter_gate_w = self.policy.client_module.feed_forward.w1.weight + self.inter_gate_b = None + + def load_params(self, module, sd, weight_quantizer, 
mp_replace, prefix): + param_names = ( + 'attention.wq.weight', \ + 'attention.wk.weight', \ + 'attention.wv.weight', \ + 'attention.wo.weight', \ + 'feed_forward.w3.weight', \ + 'feed_forward.w1.weight', \ + 'feed_forward.w2.weight', \ + 'ffn_norm.weight', \ + 'attention_norm.weight' + ) + + maybe_copy_qkv(module.attention, + sd, + weight_quantizer, + mp_replace, + 'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]], + split_qkv=self.policy.split_qkv) + for i in range(3, 4): + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], + prefix + param_names[i]) + maybe_copy_geglu(module.mlp, sd, weight_quantizer, mp_replace, 'inter_w', + [prefix + param_names[4], prefix + param_names[5]]) + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, 'output_w', prefix + param_names[6]) + + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[8], prefix + param_names[7]) + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[10], prefix + param_names[8]) + + +class LLAMA2LayerPolicy(TransformerPolicy): + + def __init__(self, client_module, inference=True): + super().__init__( + inference, + mlp_act_func_type=ActivationFuncType.GATED_SILU, + norm_type=NormType.RMSNorm, + ) + self.client_module = client_module + try: + import llama + LLAMA2LayerPolicy._orig_layer_class = llama.model.TransformerBlock # type: ignore + except: + LLAMA2LayerPolicy._orig_layer_class = None + + def get_hidden_heads(self): + return self.client_module.attention.wq.weight.shape[1], \ + self.client_module.n_heads, \ + self.client_module.ffn_norm.eps, \ + (self.client_module.feed_forward.w1.weight.shape[0] * \ + deepspeed.comm.get_world_size() if deepspeed.comm.is_initialized() else 1) # this is a hack to inject when model is already partitioned! 
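# Worked example with illustrative numbers: on a single rank, a Llama-2-7B checkpoint
# would yield roughly (hidden_size=4096, n_heads=32, the RMSNorm eps, intermediate=11008)
# here, since attention.wq.weight is [4096, 4096] and feed_forward.w1.weight is
# [11008, 4096]. attention() below then fuses wq/wk/wv into a single qkvw of shape
# [12288, 4096] via torch.cat along dim 0; the 70B model uses grouped-query attention
# (n_kv_heads < n_heads), so wk/wv are smaller there, which is what the new `num_kv`
# plumbing in the inference kernels above accounts for.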
+ + def attention(self, enable_training=False): + qw = self.client_module.attention.wq.weight + kw = self.client_module.attention.wk.weight + vw = self.client_module.attention.wv.weight + + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) + + return qkvw, \ + None, \ + self.client_module.attention.wo.weight, \ + None + + def mlp(self, enable_training=False): + mlp1_up = self.client_module.feed_forward.w3.weight + mlp1_gate = self.client_module.feed_forward.w1.weight + mlp2 = self.client_module.feed_forward.w2.weight + + mlp1 = Parameter(torch.cat((mlp1_up, mlp1_gate), dim=0), requires_grad=enable_training) + + return mlp1, None, mlp2, None + + def layernorm(self): + return self.client_module.ffn_norm.weight, \ + None, \ + self.client_module.attention_norm.weight, \ + None diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index fee5da4bfe52..12b1799e49f2 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -9,6 +9,7 @@ from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference +from deepspeed.model_implementations.transformers.ds_llama2 import DeepSpeedLlama2Inference import deepspeed.ops.transformer as transformer_inference from .layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding, RMSNormalize @@ -185,6 +186,20 @@ def load_parameters(module, prefix): LlamaRMSNorm = None except: OPTLearnedPositionalEmbedding = None + try: + from fairscale.nn.model_parallel.layers import ( + ColumnParallelLinear, + ParallelEmbedding, + RowParallelLinear, + ) + except: + ColumnParallelLinear = None + ParallelEmbedding = None + RowParallelLinear = None + try: + from llama.model import RMSNorm + except: + RMSNorm = None layer_policies = { nn.Linear: load, nn.Embedding: load, @@ -198,10 +213,15 @@ def load_parameters(module, prefix): DeepSpeedBERTInference: load_transformer_layer, DeepSpeedMegatronGPTInference: load_transformer_layer, DeepSpeedOPTInference: load_transformer_layer, + DeepSpeedLlama2Inference: load_transformer_layer, OPTLearnedPositionalEmbedding: load, OPTEmbedding: load, LlamaRMSNorm: load, - RMSNormalize: load + RMSNormalize: load, + ColumnParallelLinear: load, + ParallelEmbedding: load, + RowParallelLinear: load, + RMSNorm: load } all_ds_ids = {} @@ -228,14 +248,16 @@ def load_module_recursive(module, prefix='', level=0): if child.__class__ is nn.LayerNorm: child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) - elif child.__class__ is nn.Linear: + elif child.__class__ in [nn.Linear, ColumnParallelLinear, RowParallelLinear]: child = LinearLayer(weight_shape=child.weight.shape, bias=child.bias) setattr(module, name, child) elif child.__class__ is OPTLearnedPositionalEmbedding: child = OPTEmbedding(weight_shape=ds_shape) setattr(module, name, child) - elif child.__class__ is LlamaRMSNorm: - child = RMSNormalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.variance_epsilon) + elif child.__class__ in [LlamaRMSNorm, RMSNorm]: + child = RMSNormalize(dim=ds_shape[-1], + dtype=child.weight.dtype, + eps=child.eps if hasattr(child, 'eps') else child.variance_epsilon) setattr(module, name, child) else: ds_id = None @@ -254,7 +276,6 @@ def load_module_recursive(module, prefix='', 
level=0): level + 1) load_module_recursive(r_module) - embedding_weight = None for n, p in r_module.named_parameters(): @@ -262,6 +283,7 @@ def load_module_recursive(module, prefix='', level=0): embedding_weight = p if embedding_weight is not None and r_module.lm_head.weight.is_meta: r_module.lm_head.weight = embedding_weight + for sd_ in sd: del sd_ sd = None diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 3099aa3a4cb2..af3c4f1c885d 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -301,11 +301,12 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): checkpoint = checkpoint_dict["checkpoints"] pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards") for i in range(len(checkpoint)): + checkpoint_file = os.path.join(config.base_dir, checkpoint[i]) replaced_module = replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn, _replace_policy=config.injection_policy_tuple, - checkpoint=checkpoint[i]) + checkpoint=checkpoint_file) pbar.update(1) gc.collect() else: diff --git a/deepspeed/module_inject/replace_policy.py b/deepspeed/module_inject/replace_policy.py index c49b8f81c430..2c06e31aaa41 100755 --- a/deepspeed/module_inject/replace_policy.py +++ b/deepspeed/module_inject/replace_policy.py @@ -16,11 +16,14 @@ from .containers import LLAMALayerPolicy from .containers import UNetPolicy from .containers import VAEPolicy +from .containers import LLAMA2LayerPolicy +from .containers import InternLMLayerPolicy # transformer-based policies replace_policies = [ HFBertLayerPolicy, HFGPTNEOLayerPolicy, GPTNEOXLayerPolicy, HFGPTJLayerPolicy, MegatronLayerPolicy, - HFGPT2LayerPolicy, BLOOMLayerPolicy, HFOPTLayerPolicy, HFCLIPLayerPolicy, HFDistilBertLayerPolicy, LLAMALayerPolicy + HFGPT2LayerPolicy, BLOOMLayerPolicy, HFOPTLayerPolicy, HFCLIPLayerPolicy, HFDistilBertLayerPolicy, + LLAMALayerPolicy, LLAMA2LayerPolicy, InternLMLayerPolicy ] # non-transformer-based policies diff --git a/deepspeed/module_inject/utils.py b/deepspeed/module_inject/utils.py index c442d24fd3b6..42822128f9e1 100644 --- a/deepspeed/module_inject/utils.py +++ b/deepspeed/module_inject/utils.py @@ -18,6 +18,8 @@ def policy_to_ds_container(**kwargs): from .containers import MegatronLayerPolicy, DS_MegatronGPTContainer from .containers import HFDistilBertLayerPolicy, DS_DistilBERTContainer from .containers import LLAMALayerPolicy, DS_LLAMAContainer + from .containers import LLAMA2LayerPolicy, DS_LLAMA2Container + from .containers import InternLMLayerPolicy, DS_InternLMContainer policy_to_container = { HFGPT2LayerPolicy: DS_GPT2Container, @@ -30,6 +32,8 @@ def policy_to_ds_container(**kwargs): MegatronLayerPolicy: DS_MegatronGPTContainer, HFDistilBertLayerPolicy: DS_DistilBERTContainer, LLAMALayerPolicy: DS_LLAMAContainer, + LLAMA2LayerPolicy: DS_LLAMA2Container, + InternLMLayerPolicy: DS_InternLMContainer } container = None diff --git a/deepspeed/ops/deepspeed4science/__init__.py b/deepspeed/ops/deepspeed4science/__init__.py new file mode 100644 index 000000000000..1c5fd280fc32 --- /dev/null +++ b/deepspeed/ops/deepspeed4science/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .evoformer_attn import DS4Sci_EvoformerAttention, EvoformerFusedAttention diff --git a/deepspeed/ops/deepspeed4science/evoformer_attn.py b/deepspeed/ops/deepspeed4science/evoformer_attn.py new file mode 100644 index 000000000000..ba7e20e51d50 --- /dev/null +++ b/deepspeed/ops/deepspeed4science/evoformer_attn.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import numpy as np +from deepspeed.ops.op_builder import EvoformerAttnBuilder +from deepspeed.accelerator import get_accelerator + +kernel_ = None + + +def _attention(Q, K, V, bias1, bias2): + assert Q.shape[-3] > 16, "seq_len must be greater than 16" + O = torch.empty_like(Q, dtype=Q.dtype) + assert get_accelerator().on_accelerator(Q), "Q must be on cuda" + assert get_accelerator().on_accelerator(K), "K must be on cuda" + assert get_accelerator().on_accelerator(V), "V must be on cuda" + assert get_accelerator().on_accelerator(bias1), "bias1 must be on cuda" + assert get_accelerator().on_accelerator(bias2), "bias2 must be on cuda" + global kernel_ + if kernel_ is None: + kernel_ = EvoformerAttnBuilder().load() + nheads = Q.shape[-2] + nq = (Q.shape[-3] + 31) // 32 * 32 + nb = np.prod(Q.shape[:-3]) + lse = torch.empty((nb, nheads, nq), dtype=torch.float32, device=Q.device) + kernel_.attention(Q, K, V, bias1, bias2, O, lse) + return O, lse + + +def attention_bwd(dO, Q, K, V, O, lse, bias1, bias2): + assert max(Q.shape[-1], V.shape[-1]) <= 64, "Hidden size is too large. Need to change kMax to a larger value" + dQ = torch.empty_like(Q, dtype=Q.dtype) + dK = torch.empty_like(K, dtype=K.dtype) + dV = torch.empty_like(V, dtype=V.dtype) + assert get_accelerator().on_accelerator(dO), "dO must be on cuda" + assert get_accelerator().on_accelerator(Q), "Q must be on cuda" + assert get_accelerator().on_accelerator(K), "K must be on cuda" + assert get_accelerator().on_accelerator(V), "V must be on cuda" + assert get_accelerator().on_accelerator(O), "O must be on cuda" + global kernel_ + if kernel_ is None: + kernel_ = EvoformerAttnBuilder().load() + delta = torch.empty_like(lse) + dB1 = torch.zeros_like(bias1, dtype=torch.float32) + dB2 = torch.zeros_like(bias2, dtype=torch.float32) + kernel_.attention_bwd(dO, Q, K, V, O, lse, delta, bias1, bias2, dQ, dK, dV, dB1, dB2) + return dQ, dK, dV, dB1.to(dO.dtype), dB2.to(dO.dtype) + + +class EvoformerFusedAttention(torch.autograd.Function): + + @staticmethod + def forward(ctx, q, k, v, bias1=None, bias2=None): + """ + q, k, v: are in shape [*, L, H, D] + """ + bias1_ = bias1.contiguous() if bias1 is not None else torch.tensor([], dtype=q.dtype, device=q.device) + bias2_ = bias2.contiguous() if bias2 is not None else torch.tensor([], dtype=q.dtype, device=q.device) + q = q.contiguous() + k = k.contiguous() + v = v.contiguous() + o, lse = _attention(q, k, v, bias1_, bias2_) + ctx.save_for_backward(q, k, v, o, lse, bias1_, bias2_) + return o + + @staticmethod + def backward(ctx, grad_output): + q, k, v, o, lse, bias1, bias2 = ctx.saved_tensors + dQ, dK, dV, dB1, dB2 = attention_bwd(grad_output, q, k, v, o, lse, bias1, bias2) + if bias1.numel() == 0: + dB1 = None + if bias2.numel() == 0: + dB2 = None + return dQ, dK, dV, dB1, dB2 + + +def DS4Sci_EvoformerAttention(Q, K, V, biases): + assert len(biases) <= 2 + + if (len(biases) == 0): + biases.append(None) + + if (len(biases) == 1): + biases.append(None) + + bias_1_shape = lambda x: (x.shape[0], x.shape[1], 1, 
1, x.shape[2]) + bias_2_shape = lambda x: (x.shape[0], 1, x.shape[3], x.shape[2], x.shape[2]) + + if biases[0] is not None: + assert biases[0].shape == bias_1_shape(Q) + else: + biases[0] = Q.new_zeros(bias_1_shape(Q)) + + if biases[1] is not None: + assert biases[1].shape == bias_2_shape(Q) + else: + biases[1] = Q.new_zeros(bias_2_shape(Q)) + + return EvoformerFusedAttention.apply(Q, K, V, biases[0], biases[1]) diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index d192e7fffa84..4e29a2137c64 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -78,7 +78,8 @@ def __init__(self, set_empty_params=False, transposed_mode=False, use_triton=False, - triton_autotune=False): + triton_autotune=False, + num_kv=-1): super(DeepSpeedInferenceConfig, self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, num_hidden_layers) @@ -112,6 +113,7 @@ def __init__(self, self.transposed_mode = transposed_mode self.use_triton = use_triton self.triton_autotune = triton_autotune + self.num_kv = num_kv @classmethod def from_dict(cls, json_object): diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index 967f1d4b8d9d..eb6ce2f75c69 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -37,7 +37,8 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count self.attn_ow = None self.attn_ob = None else: - qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 + qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 if config.num_kv < 0 else \ + ((self.config.heads + self.config.num_kv * 2) // self.config.mp_size) * (self.config.hidden_size // self.config.heads) self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size, qkv_size_per_partition, dtype=data_type, @@ -56,6 +57,7 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count requires_grad=False) self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size + self.num_kv_partition = self.config.num_kv // self.config.mp_size self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads @@ -101,6 +103,7 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): attn_mask=((1 - input_mask).to(qkv_out.dtype) * minus_inf) if input_mask.dtype == torch.int64 else input_mask, heads=self.num_attention_heads_per_partition, + num_kv=self.num_kv_partition, norm_factor=(1 / self.norm_factor if self.config.scale_attention else 1.0), no_masking=no_masking, layer_id=self.config.layer_id, @@ -139,7 +142,6 @@ def forward(self, else: self._attn_qkvw = self.attn_qkvw self._attn_qkvb = self.attn_qkvb - if not self.config.pre_layer_norm: qkv_out = self.linear_func(input=input, weight=self._attn_qkvw, @@ -159,12 +161,12 @@ def forward(self, input_mask=input_mask, layer_past=layer_past, alibi=alibi) + output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] if self.config.mlp_after_attn and self.mp_group is not None and dist.get_world_size(group=self.mp_group) > 1: dist.all_reduce(output, group=self.mp_group) - return (output, key_layer, value_layer, context_layer, inp_norm) @@ -247,6 +249,11 @@ def 
compute_attention(self, qkv_out, input_mask, layer_past, alibi): offset = dist.get_rank() * self.num_attention_heads_per_partition if dist.is_initialized() else 0 target_dtype = torch.float16 if self.config.dtype == torch.int8 else self.config.dtype + + # When using the hybrid engine with BLOOM, input_mask needs to be converted from torch.bool -> torch.int64 + if input_mask.dtype == torch.bool: + input_mask = input_mask.long() + attention_probs = self.softmax_func(attn_scores=attention_scores, attn_mask=((1 - input_mask).to(target_dtype) * minus_inf), alibi=alibi, diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py index d5e12cb9a801..3064c00d1755 100644 --- a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -85,6 +85,8 @@ def forward(self, self.config.mlp_act_func_type, self.config.transposed_mode) else: + if input_bias is not None: + input += input_bias output, residual_add = self.mlp_gemm_func( input, residual, diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py index dca935c1eb11..250bf9864e1e 100644 --- a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -84,5 +84,7 @@ def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, else: output, norm = self.qkv_gemm_func(input, weight, q_scale, gamma, self.config.epsilon, q_int8, self.config.transposed_mode) + if add_bias: + output += bias return output, norm diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 3cc75860a752..012399ea1ef3 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -28,8 +28,8 @@ def softmax_context_fallback(self, query_key_value, attn_mask, rotary_dim, rotat num_layers, alibi): raise NotImplementedError - def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, norm_factor: float, - no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): + def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, num_kv: int, + norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): if alibi is not None: batch_heads = query_key_value.shape[0] * heads @@ -39,7 +39,7 @@ def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: alibi = torch.empty(1) output = self.softmax_context_func(query_key_value, attn_mask, self.config.rotary_dim, self.config.rotate_half, - self.config.rotate_every_two, heads, norm_factor, + self.config.rotate_every_two, heads, num_kv, norm_factor, self.config.triangular_masking, self.config.local_attention, self.config.window_size, no_masking, layer_id, num_layers, alibi) diff --git a/deepspeed/ops/transformer/inference/triton/attention.py b/deepspeed/ops/transformer/inference/triton/attention.py index 3bb0f47f413c..c05370ec74e5 100644 --- a/deepspeed/ops/transformer/inference/triton/attention.py +++ b/deepspeed/ops/transformer/inference/triton/attention.py @@ -6,6 +6,8 @@ import math import torch import torch.nn as nn +import triton +import triton.language as tl from deepspeed.accelerator import get_accelerator from deepspeed import comm as dist from 
deepspeed.ops.transformer.inference.op_binding import LinearOp, VectorMatMulOp, SoftmaxContextOp, QKVGemmOp @@ -70,6 +72,9 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count self.mp_group = mp_group self.use_flash = False + # triton flash attention is enabled when the compute capability >= 8.0 + if get_accelerator().is_triton_supported(): + self.use_flash = True # used for quantization self.q_scales = q_scales @@ -98,7 +103,7 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count # triton autotune table update for score/context matmul if triton_autotune: - print(f"running triton autotune for attention") + print(f"running triton autotune for regular attention kernel") __class__._triton_autotune(2, self.config.max_out_tokens, self.head_size, self.config.hidden_size, self.triangular_masking, self.scale) @@ -176,7 +181,7 @@ def forward( qkv = qkv_out[0] if use_triton_attention and (alibi is None): - context_layer = compute_attention(qkv=qkv, + context_layer = _triton_attention(qkv=qkv, input_mask=input_mask, scale=self.scale, layer_past=layer_past, @@ -204,7 +209,7 @@ def forward( global inference_module -def compute_attention(qkv, +def _triton_attention(qkv, input_mask, layer_past, alibi, @@ -217,13 +222,166 @@ def compute_attention(qkv, if isinstance(qkv, list): qkv = qkv[0] - #assert layer_past is None, "layer_past not supported in triton yet" assert alibi is None, "layer_past not supported in alibi yet" - output = score_4d_matmul(qkv, head_size, triangular, scale) - if triangular: - output = softmax(output) + + if use_triton_flash: + output = _triton_packed_flash(qkv, + head_size, + input_mask, + scale, + causal=triangular, + add_mask=(not triangular and input_mask is not None)) else: - output = softmax(output, input_mask) - output = context_4d_matmul(output, qkv, head_size) + output = score_4d_matmul(qkv, head_size, triangular, scale) + if triangular: + output = softmax(output) + else: + output = softmax(output, input_mask) + output = context_4d_matmul(output, qkv, head_size) return output + + +''' +flash attention 2 +modified the triton kernel in +https://github.com/openai/triton/blob/08c16589573621fcb8cd5a9c3b8a0537077f876d/python/tutorials/06-fused-attention.py +''' + + +@triton.jit +def _flash_packed_kernel( + QKV, + mask, + ADD_MASK: tl.constexpr, + IS_CAUSAL: tl.constexpr, + sm_scale, + Out, + stride_qz, + stride_qn, + stride_qm, + stride_mz, + stride_oz, + stride_on, + Z, + H, + N_CTX, + P_SEQ, + hidden_size, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_m = tl.program_id(0) + off_hz = tl.program_id(1) + batch = off_hz // H + head = off_hz % H + + q_offset = batch * stride_qz + head * BLOCK_DMODEL + k_offset = q_offset + hidden_size + v_offset = k_offset + hidden_size + + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + + q_ptrs = QKV + q_offset + offs_m[:, None] * stride_qn + offs_d[None, :] + k_ptrs = QKV + hidden_size + q_offset + offs_n[:, None] * stride_qn + offs_d[None, :] + v_ptrs = QKV + 2 * hidden_size + q_offset + offs_n[:, None] * stride_qn + offs_d[None, :] + + # mask + off_mask = batch * stride_mz + offs_n[None, :] + mask_ptrs = mask + off_mask + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by 
log_2(e) and use + # 2^x instead of exp in the loop because CSE and LICM + # don't work as expected with `exp` in the loop + qk_scale = sm_scale * 1.44269504 + # load q: it will stay in SRAM throughout + q = tl.load(q_ptrs, mask=offs_m[:, None] < N_CTX, other=0.0) + q = (q * qk_scale).to(tl.float16) + # loop over k, v and update accumulator + lo = 0 + hi = P_SEQ + (start_m + 1) * BLOCK_M if IS_CAUSAL else N_CTX + P_SEQ + for start_n in range(lo, hi, BLOCK_N): + # -- load k, v -- + k = tl.load(k_ptrs + start_n * stride_qn, mask=(start_n + offs_n)[:, None] < N_CTX, other=0.0) + v = tl.load(v_ptrs + start_n * stride_qn, mask=(start_n + offs_n)[:, None] < N_CTX, other=0.0) + # -- compute qk --- + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float16) + + if ADD_MASK: + mask_val = tl.load(mask_ptrs) + mask_ptrs += BLOCK_N + qk = qk + mask_val.to(tl.float32) + + if IS_CAUSAL: + qk = tl.where(P_SEQ + offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) + + qk += tl.dot(q, tl.trans(k), out_dtype=tl.float16) + qk += tl.where((start_n + offs_n)[None, :] < N_CTX, 0, minus_inf) + # -- compute scaling constant --- + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + alpha = tl.math.exp2(m_i - m_i_new) + p = tl.math.exp2(qk - m_i_new[:, None]) + # -- scale and update acc -- + acc_scale = l_i * 0 + alpha # workaround some compiler bug + acc *= acc_scale[:, None] + acc += tl.dot(p.to(tl.float16), v.to(tl.float16)) + # -- update m_i and l_i -- + l_i = l_i * alpha + tl.sum(p, 1) + m_i = m_i_new + + # write back l and m + acc = acc / l_i[:, None] + o_offset = batch * stride_oz + head * BLOCK_DMODEL + out_ptrs = Out + o_offset + (offs_m[:, None] * stride_on + offs_d[None, :]) + tl.store(out_ptrs, acc.to(tl.float16), mask=offs_m[:, None] < N_CTX) + + +def _triton_packed_flash(qkv, head_size, mask, sm_scale, causal=False, add_mask=True): + heads = qkv.shape[-1] // 3 // head_size + hidden_size = qkv.shape[-1] // 3 + + BLOCK_M = 128 + BLOCK_N = 64 if head_size <= 64 else 32 + + o = torch.empty((qkv.shape[0], qkv.shape[1], hidden_size), device=qkv.device, dtype=torch.half) + if mask is None: + mask = torch.empty(0) + add_mask = False + + grid = (triton.cdiv(qkv.shape[1], BLOCK_M), qkv.shape[0] * heads, 1) + num_stages = 4 if head_size <= 64 else 3 + num_warps = 4 + P_SEQ = 0 + + _flash_packed_kernel[grid](qkv, + mask, + add_mask, + causal, + sm_scale, + o, + qkv.stride(0), + qkv.stride(1), + qkv.stride(2), + mask.stride(1) if add_mask else 0, + o.stride(0), + o.stride(1), + qkv.shape[0], + heads, + qkv.shape[1], + P_SEQ, + hidden_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + BLOCK_DMODEL=head_size, + num_warps=num_warps, + num_stages=num_stages) + + return o diff --git a/deepspeed/ops/transformer/inference/triton/triton_matmul_kernel.py b/deepspeed/ops/transformer/inference/triton/triton_matmul_kernel.py index 9d647ea090f6..e2128e046df0 100644 --- a/deepspeed/ops/transformer/inference/triton/triton_matmul_kernel.py +++ b/deepspeed/ops/transformer/inference/triton/triton_matmul_kernel.py @@ -12,11 +12,33 @@ SKIP_AUTOTUNE = False +def _triton_ops_matmul_early_config_prune(configs, named_args): + device = torch.cuda.current_device() #ignore-cuda + capability = torch.cuda.get_device_capability() #ignore-cuda + # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages + dtsize = named_args['A'].element_size() + dtype = named_args['A'].dtype + + # 1. 
make sure we have enough smem + pruned_configs = [] + for config in configs: + kw = config.kwargs + BLOCK_M, BLOCK_N, BLOCK_K, num_stages = \ + kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], config.num_stages + + max_shared_memory = triton.runtime.driver.utils.get_device_properties(device)["max_shared_mem"] + required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize + if required_shared_memory <= max_shared_memory: + pruned_configs.append(config) + + return pruned_configs + + def _fp16_matmul_prune_config(configs, named_args, skip_autotune=SKIP_AUTOTUNE): if skip_autotune: configs = [configs[0]] else: - configs = triton.ops.matmul_perf_model.early_config_prune(configs, named_args) + configs = _triton_ops_matmul_early_config_prune(configs, named_args) return configs @@ -199,8 +221,7 @@ def matmul_4d_prune_config(configs, named_args, skip_autotune=SKIP_AUTOTUNE): BLOCK_M, BLOCK_N, BLOCK_K, num_stages = \ kw['BLOCK_SIZE_M'], kw['BLOCK_SIZE_N'], kw['BLOCK_SIZE_K'], config.num_stages - triton.compiler.init_cuda_utils() - max_shared_memory = triton.compiler.cuda_utils.get_device_properties(device)["max_shared_mem"] + max_shared_memory = triton.runtime.driver.utils.get_device_properties(device)["max_shared_mem"] required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize if required_shared_memory <= max_shared_memory: pruned_configs.append(config) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index fd1b829a62c7..4e8a0faa0ae0 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -366,6 +366,8 @@ def train_batch(self, data_iter=None): f'loss: {self.agg_train_loss:0.4f} ' f'iter time (s): {iter_time:0.3f} ' f'samples/sec: {tput:0.3f}') + else: + self.timers(TRAIN_BATCH_TIMER).elapsed(reset=True) # Monitoring if self.global_rank == 0 and self.monitor.enabled: diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 7715c320929e..9509b5a692ca 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1116,7 +1116,7 @@ def __add_grad_to_ipg_bucket(self, param: Parameter) -> None: if not get_accelerator().is_synchronized_device(): self.reduce_and_partition_stream.wait_stream(get_accelerator().default_stream()) - if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel() < self.reduce_bucket_size: + if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel() <= self.reduce_bucket_size: # move the gradient to a contiguous buffer with get_accelerator().stream(self.reduce_and_partition_stream): # move the parameter's gradient to the contiguous flat buffer diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 50121519e179..3c1efaad27a0 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -389,13 +389,6 @@ def __init__(self, self.params_not_in_partition.append(params_not_in_partition) self.first_offset.append(first_offset) - for rank in range(dist.get_world_size()): - if dist.get_rank() == rank: - print( - f"Rank: {rank} partition count {self.partition_count} and sizes{[(p.numel(), self.is_moe_param_group[i] if hasattr(self, 'is_moe_param_group') else False) for i,p in enumerate(self.single_partition_of_fp32_groups)]} " - ) - dist.barrier() - self.reduce_bucket_size = int(reduce_bucket_size) self.allgather_bucket_size = int(allgather_bucket_size) diff --git a/docs/_config.yml b/docs/_config.yml index 
7127b8459fe2..ac8d9028e58f 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -41,6 +41,7 @@ collections: - cifar-10.md - curriculum-learning.md - data-efficiency.md + - ds4sci_evoformerattention.md - flops-profiler.md - pytorch-profiler.md - autotuning.md diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index d0587867c260..217d56c14812 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -17,6 +17,8 @@ lnav: url: /inference/ - title: 'Compression' url: /compression/ + - title: 'Science' + url: /deepspeed4science/ - title: 'Getting Started' url: /getting-started/ - title: 'ds_config' @@ -67,6 +69,8 @@ lnav: url: /tutorials/curriculum-learning/ - title: 'Data Efficiency' url: /tutorials/data-efficiency/ + - title: 'DS4Sci_EvoformerAttention' + url: /tutorials/ds4sci_evoformerattention/ - title: 'Flops Profiler' url: /tutorials/flops-profiler/ - title: 'PyTorch Profiler' diff --git a/docs/_pages/deepspeed4science.md b/docs/_pages/deepspeed4science.md new file mode 100755 index 000000000000..6dd87ce996e2 --- /dev/null +++ b/docs/_pages/deepspeed4science.md @@ -0,0 +1,39 @@ +--- +title: "DeepSpeed4Science Overview and Tutorial" +permalink: /deepspeed4science/ +toc: true +toc_label: "Contents" +toc_sticky: true +--- + +In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts unlock today's biggest science mysteries. This page serves as an overview of all technologies released (or to be released in the future) as part of the DeepSpeed4Science initiative, making it easier for scientists to shop for the techniques they need. Details of the DeepSpeed4Science initiative can be found at [our website](https://deepspeed4science.ai/). For each technique, we introduce what it is for, when to use it, links to how to use it, and its existing scientific applications (we welcome users to contribute more showcases if they apply our techniques in their scientific research): + +* [2023/09] We are releasing two techniques: the [DeepSpeed4Science large-scale training framework](#new-megatron-deepspeed-for-large-scale-ai4science-model-training) and [DS4Sci_EvoformerAttention](#memory-efficient-evoformerattention-kernels), together with their scientific applications in structural biology research. + + +## New Megatron-DeepSpeed for Large-Scale AI4Science Model Training + +We are proud to introduce the [new Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed), an updated framework for large-scale model training. We rebased onto the newest Megatron-LM and enabled DeepSpeed with it, providing long sequence support and many other capabilities. With the new Megatron-DeepSpeed, users can now train their large AI4Science models like GenSLMs with much longer sequences via a synergetic combination of ZeRO-style data parallelism, tensor parallelism, sequence parallelism, pipeline parallelism, model state offloading, and several newly added memory optimization techniques such as attention mask offloading and position embedding partitioning. + +![new Megatron-DeepSpeed](/assets/images/new-megatron-ds.png){: .align-center} +

+The figure shows the sequence lengths enabled when training a 33B parameter GPT-like model with our new Megatron-DeepSpeed framework. The results show that the new Megatron-DeepSpeed enables 9x longer sequence lengths than NVIDIA's Megatron-LM without triggering out-of-memory errors.

+ +To see how the new Megatron-DeepSpeed helps enable new system capabilities, such as training models with massive sequence lengths, please read our [tutorial](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support). + +Meanwhile, our new Megatron-DeepSpeed has been applied to the genome-scale foundation model [GenSLMs](https://github.com/ramanathanlab/genslm), which is a 2022 [ACM Gordon Bell award](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022)-winning genome-scale language model from Argonne National Lab. To achieve their scientific goal, GenSLMs and similar models require very long sequence support for both training and inference, beyond what generic LLM long-sequence strategies provide. By leveraging DeepSpeed4Science's new Megatron-DeepSpeed, the GenSLMs team is able to train their 25B model with a 512K sequence length, much longer than their original 42K sequence length. Detailed information about the methodology can be found at [our website](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/). The GenSLMs team also hosts an [example](https://github.com/ramanathanlab/genslm/tree/main/examples/long-sequences) of how to use DeepSpeed4Science in the GenSLMs repo. + + +## Memory-Efficient EvoformerAttention Kernels + +[Evoformer](https://www.nature.com/articles/s41586-021-03819-2) is a key building block for scientific models such as DeepMind's AlphaFold. However, Evoformer's multiple sequence alignment (MSA) attention frequently runs into memory explosion problems during training/inference, for example in protein structure prediction models. Existing techniques such as FlashAttention cannot effectively support Evoformer because EvoformerAttention uses row-wise/column-wise/triangle attention, which differ from standard Transformer self-attention and cross-attention and therefore require custom optimizations. To mitigate the memory explosion problem, we introduce the `DS4Sci_EvoformerAttention` kernels, a collection of kernels that improve the memory efficiency of Evoformer variants. `DS4Sci_EvoformerAttention` is easy to use. To see how you can use it, please refer to our [tutorial](/tutorials/ds4sci_evoformerattention/). + +`DS4Sci_EvoformerAttention` has already been applied to [OpenFold](https://github.com/aqlaboratory/openfold), which is a community reproduction of DeepMind's AlphaFold2 that makes it possible to train or finetune AlphaFold2 on new datasets. With the DS4Sci_EvoformerAttention kernels, the OpenFold team is able to reduce the peak memory requirement by 13x without accuracy loss. Detailed information about the methodology can be found at [our website](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/). + + + +![DS4Sci_EvoformerAttention](/assets/images/evoformer.png){: .align-center} +

+The figure shows that DeepSpeed's EvoformerAttention kernels help reduce OpenFold's peak memory requirement for training by 13x.

diff --git a/docs/_posts/2023-09-19-deepspeed4science-chinese.md b/docs/_posts/2023-09-19-deepspeed4science-chinese.md new file mode 100644 index 000000000000..7b0ccf00aa61 --- /dev/null +++ b/docs/_posts/2023-09-19-deepspeed4science-chinese.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed4Science:利用先进的AI系统优化技术实现科学发现" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md +date: 2023-09-19 00:00:00 +tags: training inference science Chinese +--- diff --git a/docs/_posts/2023-09-19-deepspeed4science-japanese.md b/docs/_posts/2023-09-19-deepspeed4science-japanese.md new file mode 100644 index 000000000000..8c0a1b6d0082 --- /dev/null +++ b/docs/_posts/2023-09-19-deepspeed4science-japanese.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed4Scienceイニシアティブ: 洗練されたAIシステムのテクノロジーにより大規模な科学的発見を可能に" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md +date: 2023-09-19 00:00:00 +tags: training inference science Japanese +--- diff --git a/docs/_posts/2023-09-19-deepspeed4science.md b/docs/_posts/2023-09-19-deepspeed4science.md new file mode 100644 index 000000000000..faeaa1331944 --- /dev/null +++ b/docs/_posts/2023-09-19-deepspeed4science.md @@ -0,0 +1,7 @@ +--- +title: "Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies" +excerpt: "" +link: https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/ +date: 2023-09-19 00:00:00 +tags: training inference science English +--- diff --git a/docs/_tutorials/ds4sci_evoformerattention.md b/docs/_tutorials/ds4sci_evoformerattention.md new file mode 100644 index 000000000000..a623dd6aa2ca --- /dev/null +++ b/docs/_tutorials/ds4sci_evoformerattention.md @@ -0,0 +1,74 @@ +--- +title: "DS4Sci_EvoformerAttention eliminates memory explosion problems for scaling Evoformer-centric structural biology models" +tags: training inference +--- + +## 1. What is DS4Sci_EvoformerAttention +`DS4Sci_EvoformerAttention` is a collection of kernels built to scale the [Evoformer](https://www.nature.com/articles/s41586-021-03819-2) computation to a larger number of sequences and residues by reducing the memory footprint and increasing the training speed. + +## 2. When to use DS4Sci_EvoformerAttention +`DS4Sci_EvoformerAttention` is most beneficial when the number of sequences and residues is large. The forward kernel is optimized to accelerate computation, so it is beneficial to use it during inference for various attention mechanisms. The associated backward kernel can be used during training to reduce the memory footprint at the cost of some extra computation. Therefore, it is beneficial to use `DS4Sci_EvoformerAttention` during training for memory-constrained operations such as MSA row-wise attention and MSA column-wise attention. + +## 3. How to use DS4Sci_EvoformerAttention + +### 3.1 Installation + +`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3 and is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify its path in the environment variable `CUTLASS_PATH`. + +```shell +git clone https://github.com/NVIDIA/cutlass +export CUTLASS_PATH=/path/to/cutlass +``` +The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time.
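+
+As a quick sanity check before relying on just-in-time compilation, you can ask DeepSpeed whether the kernel is considered buildable in your environment. The snippet below is a minimal sketch (it mirrors the check used by DeepSpeed's own unit tests and assumes `CUTLASS_PATH` is already set):
+
+```python
+import deepspeed
+from deepspeed.ops.op_builder import EvoformerAttnBuilder
+
+# True only when CUTLASS, the CUDA toolkit and the GPU architecture satisfy the builder's requirements
+print(deepspeed.ops.__compatible_ops__[EvoformerAttnBuilder.NAME])
+```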
+ +`DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher (NVIDIA V100 or later GPUs) and a minimum CUDA version of 11.3. It is recommended to use CUDA 11.7 or later for better performance. Note that the performance of the backward kernel on V100 GPUs is currently not as good as that on A100. + +### 3.2 Unit test and benchmark + +The unit test and benchmark are available in the `tests` folder of the DeepSpeed repo. You can use the following commands to run them. + +```shell +pytest -s tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py +python tests/benchmarks/DS4Sci_EvoformerAttention_bench.py +``` + +### 3.3 Applying DS4Sci_EvoformerAttention to your own model + +To use `DS4Sci_EvoformerAttention` in your own model, you need to import `DS4Sci_EvoformerAttention` from `deepspeed.ops.deepspeed4science`. + +```python +from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention +``` + +`DS4Sci_EvoformerAttention` supports four attention mechanisms in Evoformer (MSA row-wise, MSA column-wise, and two kinds of triangular attention) by using different inputs as shown in the following examples. In the examples, we denote the number of sequences as `N_seq` and the number of residues as `N_res`. The hidden dimension `Dim` and the number of heads `Head` differ across the attention variants. Note that `DS4Sci_EvoformerAttention` requires the input tensors to be in `torch.float16` or `torch.bfloat16` data type. + +(a) **MSA row-wise attention** builds attention weights for residue pairs and integrates the information from the pair representation as an additional bias term. +```python +# Q, K, V: [Batch, N_seq, N_res, Head, Dim] +# res_mask: [Batch, N_seq, 1, 1, N_res] +# pair_bias: [Batch, 1, Head, N_res, N_res] +out = DS4Sci_EvoformerAttention(Q, K, V, [res_mask, pair_bias]) +``` + +(b) **MSA column-wise attention** lets the elements that belong to the same target residue exchange information. +```python +# Q, K, V: [Batch, N_res, N_seq, Head, Dim] +# res_mask: [Batch, N_seq, 1, 1, N_res] +out = DS4Sci_EvoformerAttention(Q, K, V, [res_mask]) +``` + +(c) **Triangular self-attention** updates the pair representation. There are two kinds of triangular self-attention: around the starting node and around the ending node. Below is an example of triangular self-attention around the starting node; the variant around the ending node is similar. +```python +# Q, K, V: [Batch, N_res, N_res, Head, Dim] +# res_mask: [Batch, N_res, 1, 1, N_res] +# right_edges: [Batch, 1, Head, N_res, N_res] +out = DS4Sci_EvoformerAttention(Q, K, V, [res_mask, right_edges]) +``` + +## 4. DS4Sci_EvoformerAttention scientific application + +### 4.1 DS4Sci_EvoformerAttention eliminates memory explosion problems for scaling Evoformer-centric structural biology models in OpenFold + +[OpenFold](https://github.com/aqlaboratory/openfold) is a community reproduction of DeepMind's AlphaFold2 that makes it possible to train or finetune AlphaFold2 on new datasets. Training AlphaFold2 incurs a memory explosion problem because it contains several custom Evoformer attention variants that manifest unusually large activations. By leveraging DeepSpeed4Science's DS4Sci_EvoformerAttention kernels, the OpenFold team is able to reduce the peak memory requirement by 13x without accuracy loss. Detailed information about the methodology can be found at [our website](https://deepspeed4science.ai/2023/09/18/model-showcase-openfold/).
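+
+To put the pieces from section 3 together, below is a minimal, self-contained sketch of an MSA row-wise attention call. It is adapted from the benchmark script `tests/benchmarks/DS4Sci_EvoformerAttention_bench.py`; the tensor sizes are made up for illustration, and a CUDA GPU plus a configured `CUTLASS_PATH` are assumed.
+
+```python
+import torch
+from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention
+
+# Illustrative sizes: 128 sequences, 64 residues, 4 heads of dimension 32
+# (the backward kernel currently requires the head dimension to be <= 64).
+batch, n_seq, n_res, heads, dim = 1, 128, 64, 4, 32
+dtype, device = torch.float16, "cuda"
+
+Q = torch.randn(batch, n_seq, n_res, heads, dim, dtype=dtype, device=device, requires_grad=True)
+K = torch.randn(batch, n_seq, n_res, heads, dim, dtype=dtype, device=device, requires_grad=True)
+V = torch.randn(batch, n_seq, n_res, heads, dim, dtype=dtype, device=device, requires_grad=True)
+
+# Bias terms: an all-zero mask bias (nothing masked) and a random pair bias.
+res_mask = torch.zeros(batch, n_seq, 1, 1, n_res, dtype=dtype, device=device)
+pair_bias = torch.randn(batch, 1, heads, n_res, n_res, dtype=dtype, device=device, requires_grad=True)
+
+out = DS4Sci_EvoformerAttention(Q, K, V, [res_mask, pair_bias])  # [batch, n_seq, n_res, heads, dim]
+out.backward(torch.ones_like(out))  # gradients flow to Q, K, V and pair_bias
+```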
+ + diff --git a/docs/assets/images/3pillars.png b/docs/assets/images/3pillars.png deleted file mode 100755 index c2943ca912a1..000000000000 Binary files a/docs/assets/images/3pillars.png and /dev/null differ diff --git a/docs/assets/images/DeepSpeed-pillars.png b/docs/assets/images/DeepSpeed-pillars.png new file mode 100644 index 000000000000..e41a02a86058 Binary files /dev/null and b/docs/assets/images/DeepSpeed-pillars.png differ diff --git a/docs/assets/images/evoformer.png b/docs/assets/images/evoformer.png new file mode 100755 index 000000000000..a3da3b18febd Binary files /dev/null and b/docs/assets/images/evoformer.png differ diff --git a/docs/assets/images/new-megatron-ds.png b/docs/assets/images/new-megatron-ds.png new file mode 100755 index 000000000000..a8f408338afe Binary files /dev/null and b/docs/assets/images/new-megatron-ds.png differ diff --git a/docs/index.md b/docs/index.md index b4ae1b84cdea..210e1494f7e2 100755 --- a/docs/index.md +++ b/docs/index.md @@ -7,11 +7,11 @@ title: "Latest News" --- DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). +* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](/deepspeed4science/)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] * [2023/08] [DeepSpeed ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) * [2023/08] [DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md) -* [2023/08] [DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) +* [2023/08] [DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md)] * [2023/06] [ZeRO++: A leap in speed for LLM and chat model training with 4X less communication](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/)[[English](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md)] -* [2023/04] 🚀 [DeepSpeed Chat: Easy, Fast 
and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]🚀 # Extreme Speed and Scale for DL Training and Inference @@ -24,9 +24,9 @@ title: "Latest News" * Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs -# DeepSpeed has three innovation pillars: +# DeepSpeed has four innovation pillars: -![Three innovation pillars](/assets/images/3pillars.png){: .align-center} +[![Four innovation pillars](/assets/images/DeepSpeed-pillars.png){: .align-center}](https://deepspeed4science.ai/) ## DeepSpeed-Training @@ -41,6 +41,10 @@ DeepSpeed brings together innovations in parallelism technology such as tensor, To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the DeepSpeed-Compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression) +## DeepSpeed4Science + +In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](/deepspeed4science/) + # DeepSpeed Software Suite ## DeepSpeed Library diff --git a/environment.yml b/environment.yml index ed51ed21afc8..e55fe96e5a5a 100644 --- a/environment.yml +++ b/environment.yml @@ -18,3 +18,4 @@ dependencies: - certifi - openssl - python=3.10 + - pydantic<2.0.0 diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py new file mode 100644 index 000000000000..f4311848d0d4 --- /dev/null +++ b/op_builder/evoformer_attn.py @@ -0,0 +1,72 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CUDAOpBuilder, installed_cuda_version +import os + + +class EvoformerAttnBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_EVOFORMER_ATTN" + NAME = "evoformer_attn" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + self.cutlass_path = os.environ.get('CUTLASS_PATH') + + def absolute_name(self): + return f'deepspeed.ops.{self.NAME}_op' + + def extra_ldflags(self): + if not self.is_rocm_pytorch(): + return ['-lcurand'] + else: + return [] + + def sources(self): + src_dir = 'csrc/deepspeed4science/evoformer_attn' + return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention.cu'] + + def nvcc_args(self): + args = super().nvcc_args() + try: + import torch + except ImportError: + self.warning("Please install torch if trying to pre-compile kernels") + return args + major = torch.cuda.get_device_properties(0).major #ignore-cuda + minor = torch.cuda.get_device_properties(0).minor #ignore-cuda + args.append(f"-DGPU_ARCH={major}{minor}") + return args + + def is_compatible(self, verbose=True): + try: + import torch + except ImportError: + self.warning("Please install torch if trying to pre-compile kernels") + return False + if self.cutlass_path is None: + self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH") + return False + with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f: + if '3.1.0' not in f.read(): + self.warning("Please use CUTLASS version >= 3.1.0") + return False + cuda_okay = True + if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda + sys_cuda_major, _ = installed_cuda_version() + torch_cuda_major = int(torch.version.cuda.split('.')[0]) + cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda + if cuda_capability < 7: + self.warning("Please use a GPU with compute capability >= 7.0") + cuda_okay = False + if torch_cuda_major < 11 or sys_cuda_major < 11: + self.warning("Please use CUDA 11+") + cuda_okay = False + return super().is_compatible(verbose) and cuda_okay + + def include_paths(self): + includes = [f'{self.cutlass_path}/include', f'{self.cutlass_path}/tools/util/include'] + return includes diff --git a/release/check_release_version.py b/release/check_release_version.py new file mode 100644 index 000000000000..abf1e403f318 --- /dev/null +++ b/release/check_release_version.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import argparse +from packaging import version as pkg_version + +parser = argparse.ArgumentParser() + +parser.add_argument("--new_version", type=str, help="The new version being published.") + +args = parser.parse_args() + +new_version = pkg_version.parse(args.new_version) + +with open('./version.txt') as fd: + current_version = pkg_version.parse(fd.read()) + +# Valid version are those where the major/minor/micro are incremented by no more than one from the existing release, and the less significant values are reset to 0. 
+valid_major_update = pkg_version.Version(f'{current_version.major + 1}.0.0') +valid_minor_update = pkg_version.Version(f'{current_version.major}.{current_version.minor + 1}.0') +valid_micro_update = pkg_version.Version( + f'{current_version.major}.{current_version.minor}.{current_version.micro + 1}') + +valid_versions = [valid_major_update, valid_minor_update, valid_micro_update] + +if new_version not in valid_versions: + raise Exception(f'{new_version} is an invalid version. Valid versions are {valid_versions}.\n') diff --git a/release/release.sh b/release/release.sh index 3e14257374d1..a4d38674d48c 100644 --- a/release/release.sh +++ b/release/release.sh @@ -25,6 +25,13 @@ if [ "${version}" != `cat version.txt` ]; then exit 1 fi +echo "checking that the version is valid" +python release/check_release_version.py --new_version ${version} +if [ $? != 0 ]; then + echo 'please check the version number selected' + exit 1 +fi + python -c "import twine" if [ $? != 0 ]; then echo 'please install twine via pip' diff --git a/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py b/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py new file mode 100644 index 000000000000..b19eae7272c4 --- /dev/null +++ b/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +This script is to test the correctness of the DS4Sci_EvoformerAttention op. +To run the script, +1. Clone the CUTLASS repo. E.g. git clone https://github.com/NVIDIA/cutlass.git +2. Specify the CUTLASS_PATH environment variable. E.g. export CUTLASS_PATH=$(pwd)/cutlass +3. Run the script. E.g. python DS4Sci_EvoformerAttention_bench.py +""" + +import contextlib +import torch +from typing import List +from torch.nn import functional as F +from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention +from deepspeed.accelerator import get_accelerator + + +def attention_reference( + q_input: torch.Tensor, # [*, Dim_Q, H, C_hid] + k_input: torch.Tensor, # [*, Dim_Q, H, C_hid] + v_input: torch.Tensor, # [*, Dim_Q, H, C_hid] + biases: List[torch.Tensor], + sm_scale: float) -> torch.Tensor: + # Original shape: [*, Dim_Q, H, C_hid] -> Transpose to: [*, H, Dim_Q, C_hid] + q = q_input.transpose(-2, -3) + k = k_input.transpose(-2, -3) + v = v_input.transpose(-2, -3) + + # Now, q, k, v are in shape: [*, H, Dim_Q, C_hid] + + # Transpose k to shape [*, H, C_hid, Dim_Q] + k_t = k.transpose(-1, -2) + + # Now, q and k_t are in shapes: [*, H, Dim_Q, C_hid] and [*, H, C_hid, Dim_Q] respectively + + # [*, H, Dim_Q, Dim_Q] + a = torch.matmul(q, k_t) * sm_scale + + for b in biases: + a += b + + a = F.softmax(a, dim=-1) + + # Now, a is in shape [*, H, Dim_Q, Dim_Q], v is in shape [*, H, Dim_Q, C_hid] + + # Matmul operation results in [*, H, Dim_Q, C_hid] + a_v = torch.matmul(a, v) + + # [*, Dim_Q, H, C_hid] + o = a_v.transpose(-2, -3) + + return o + + +dtype = torch.float16 + +N = 256 +heads = 4 +dim = 32 +seq_len = 256 + + +@contextlib.contextmanager +def cuda_timer(res_list): + start = get_accelerator().Event(enable_timing=True) + end = get_accelerator().Event(enable_timing=True) + start.record() + yield + end.record() + get_accelerator().synchronize() + res_list.append(start.elapsed_time(end)) + + +def benchmark(): + ours_fw = [] + ours_bw = [] + baseline_fw = [] + baseline_bw = [] + for batch in range(1, 17): + Q = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda", requires_grad=True) + K = torch.randn(batch, N, seq_len, 
heads, dim, dtype=dtype, device="cuda", requires_grad=True) + V = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda", requires_grad=True) + bias1 = torch.randn(batch, N, 1, 1, seq_len, dtype=dtype, device="cuda", requires_grad=True) + bias2 = torch.randn(batch, 1, heads, seq_len, seq_len, dtype=dtype, device="cuda", requires_grad=True) + # warm up + DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2]) + with cuda_timer(ours_fw): + out = DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2]) + d_out = torch.rand_like(out) + with cuda_timer(ours_bw): + out.backward(d_out) + # warm up + attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5)) + with cuda_timer(baseline_fw): + ref_out = attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5)) + with cuda_timer(baseline_bw): + ref_out.backward(d_out) + + print(f"batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)") + for i in range(len(ours_fw)): + print(f"{i+1}\t{ours_fw[i]}\t{baseline_fw[i]}\t{ours_bw[i]}\t{baseline_bw[i]}") + + +benchmark() diff --git a/tests/unit/compression/test_dequantization.py b/tests/unit/compression/test_dequantization.py new file mode 100644 index 000000000000..692f4cef97d7 --- /dev/null +++ b/tests/unit/compression/test_dequantization.py @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Copyright (c) 2023, 2023, Oracle and/or its affiliates. + +import os +import torch +from unit.common import DistributedTest +from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator + + +class TestDequantization(DistributedTest): + + def init(self): + local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.device = torch.device(get_accelerator().device_name(local_rank)) + + self.dequantize_func = InferenceBuilder().load().dequantize_fp16 + + def run_dequantize_test(self, M, N, num_groups): + weight = torch.randint(-255, 255, (M, N)).to(dtype=torch.int8, device=self.device) + scale = torch.rand(num_groups, 1).to(device=self.device) + + weight_deq = (weight.reshape(num_groups, -1) * scale).reshape(M, N).to(torch.float16).contiguous() + weight_deq_backend = self.dequantize_func(weight, scale, num_groups) + + assert torch.allclose(weight_deq, weight_deq_backend) + + def test_dequantize(self): + self.init() + + self.run_dequantize_test(14336, 7168, 32) + self.run_dequantize_test(14336, 1792, 32) + self.run_dequantize_test(768, 768, 32) + self.run_dequantize_test(768, 768, 48) diff --git a/tests/unit/inference/quantization/test_int4_quantization.py b/tests/unit/inference/quantization/test_int4_quantization.py index dd10ee0c0647..56a5a7d48382 100644 --- a/tests/unit/inference/quantization/test_int4_quantization.py +++ b/tests/unit/inference/quantization/test_int4_quantization.py @@ -11,18 +11,17 @@ from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization from deepspeed.inference.quantization.utils import Quantizer, DeQuantizer from deepspeed.inference.quantization.layers import QuantizedLinear +from deepspeed.runtime.utils import required_torch_version from transformers.models.opt.modeling_opt import OPTDecoderLayer from transformers import AutoConfig, OPTConfig, AutoModel import pytest from collections import OrderedDict from typing import Dict -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) device = get_accelerator().device_name() if get_accelerator().is_available() else 'cpu' -if (TORCH_MAJOR < 1 or 
(TORCH_MAJOR == 1 and TORCH_MINOR < 10)): - pytest.skip("torch.Tensor.bitwise_left_shift in INT4 quantizer needs torch 1.10 or above.", +if not required_torch_version(min_version=1.11): + pytest.skip("torch.Tensor.bitwise_left_shift in INT4 quantizer needs torch 1.11 or above.", allow_module_level=True) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index e591a214c3f7..4ee3cd73c045 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -68,7 +68,7 @@ ] # Get a list of all models and mapping from task to supported models -_hf_models = HfApi().list_models() +_hf_models = list(HfApi().list_models()) _hf_model_names = [m.modelId for m in _hf_models] _hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks} @@ -257,6 +257,10 @@ def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton): msg = "triton needs to be installed for the test" elif ("bert" not in model.lower()) and enable_triton: msg = "Triton kernels do not support Non bert/roberta models yet" + + # These should be removed once the several currently failing inference tests are fixed + if model in ["EleutherAI/pythia-70m-deduped", "distilbert-base-cased-distilled-squad", "EleutherAI/gpt-j-6b"]: + msg = "Test is currently broken" return msg diff --git a/tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py b/tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py new file mode 100644 index 000000000000..f8cd46e29228 --- /dev/null +++ b/tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py @@ -0,0 +1,110 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import List + +import pytest +import torch +from torch.nn import functional as F +import deepspeed +from deepspeed.ops.op_builder import EvoformerAttnBuilder +from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention +from deepspeed.accelerator import get_accelerator +from unit.util import skip_on_arch + +if not deepspeed.ops.__compatible_ops__[EvoformerAttnBuilder.NAME]: + pytest.skip("DS4Sci_EvoformerAttention ops are not available on this system", allow_module_level=True) + + +def attention_reference( + q_input: torch.Tensor, # [*, Dim_Q, H, C_hid] + k_input: torch.Tensor, # [*, Dim_Q, H, C_hid] + v_input: torch.Tensor, # [*, Dim_Q, H, C_hid] + biases: List[torch.Tensor], + sm_scale: float) -> torch.Tensor: + q = q_input.transpose(-2, -3) + k = k_input.transpose(-2, -3) + v = v_input.transpose(-2, -3) + k_t = k.transpose(-1, -2) + a = torch.matmul(q, k_t) * sm_scale + + for b in biases: + a += b + + a = F.softmax(a, dim=-1) + a_v = torch.matmul(a, v) + o = a_v.transpose(-2, -3) + + return o + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("tensor_shape", [(1, 256, 256, 4, 32), (1, 512, 256, 8, 8)]) +def test_DS4Sci_EvoformerAttention(dtype, tensor_shape): + skip_on_arch(8 if dtype == torch.bfloat16 else 7) + batch, n, seq_len, heads, dim = tensor_shape + Q = torch.randn(batch, + n, + seq_len, + heads, + dim, + dtype=dtype, + device=get_accelerator().device_name(), + requires_grad=True) + K = torch.randn(batch, + n, + seq_len, + heads, + dim, + dtype=dtype, + device=get_accelerator().device_name(), + requires_grad=True) + V = torch.randn(batch, + n, + seq_len, + heads, + dim, + dtype=dtype, + device=get_accelerator().device_name(), + requires_grad=True) + bias1 = torch.randn(batch, + n,
+ 1, + 1, + seq_len, + dtype=dtype, + device=get_accelerator().device_name(), + requires_grad=True) + bias2 = torch.randn(batch, + 1, + heads, + seq_len, + seq_len, + dtype=dtype, + device=get_accelerator().device_name(), + requires_grad=True) + dummy_out = torch.rand_like(Q, dtype=dtype, device=get_accelerator().device_name()) + ref_out = attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5)) + ref_out.backward(dummy_out) + ref_dv, V.grad = V.grad.clone(), None + ref_dk, K.grad = K.grad.clone(), None + ref_dq, Q.grad = Q.grad.clone(), None + ref_db1, bias1.grad = bias1.grad.clone(), None + ref_db2, bias2.grad = bias2.grad.clone(), None + + out = DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2]) + out.backward(dummy_out) + dv, V.grad = V.grad.clone(), None + dk, K.grad = K.grad.clone(), None + dq, Q.grad = Q.grad.clone(), None + db1, bias1.grad = bias1.grad.clone(), None + db2, bias2.grad = bias2.grad.clone(), None + + assert torch.allclose(ref_out, out, atol=2e-2, rtol=0), f"\n{ref_out} \n {out}" + assert torch.allclose(ref_dv, dv, atol=2e-2, rtol=0), f"\n{ref_dv} \n {dv}" + assert torch.allclose(ref_dk, dk, atol=2e-2, rtol=0), f"\n{ref_dk} \n {dk}" + assert torch.allclose(ref_dq, dq, atol=2e-2, rtol=0), f"\n{ref_dq} \n {dq}" + assert torch.allclose(ref_db1, db1, atol=2e-2, rtol=1e-2), f"{ref_db1} \n {db1}" + assert torch.allclose(ref_db2, db2, atol=2e-2, rtol=1e-2), f"{ref_db2} \n {db2}" diff --git a/tests/unit/ops/transformer/inference/inference_test_utils.py b/tests/unit/ops/transformer/inference/inference_test_utils.py index fb9518f6a39c..9c7b428c0e68 100644 --- a/tests/unit/ops/transformer/inference/inference_test_utils.py +++ b/tests/unit/ops/transformer/inference/inference_test_utils.py @@ -52,3 +52,14 @@ def assert_almost_equal(x, y, decimal=2, err_msg=''): y = y.float() y = y.cpu().detach().numpy() npt.assert_array_almost_equal(x, y, err_msg=err_msg, decimal=decimal) + + +def max_diff(a, b): + a = a.to(torch.float32).flatten() + b = b.to(torch.float32).flatten() + diff = torch.abs(a - b) + max_diff_indices = torch.argsort(diff)[-1] + print("Max difference indices:", max_diff_indices) + print("Max difference values:", diff[max_diff_indices]) + print(f"{a[max_diff_indices]} vs {b[max_diff_indices]}") + return max_diff_indices diff --git a/tests/unit/ops/transformer/inference/test_attention.py b/tests/unit/ops/transformer/inference/test_attention.py index db4221305a51..13abe8b915c7 100644 --- a/tests/unit/ops/transformer/inference/test_attention.py +++ b/tests/unit/ops/transformer/inference/test_attention.py @@ -6,6 +6,7 @@ import pytest import torch import deepspeed +from deepspeed.accelerator import get_accelerator from .inference_test_utils import assert_almost_equal @@ -19,54 +20,72 @@ def ref_torch_attention(q, k, v, mask, sm_scale): # test attention operator @pytest.mark.inference_ops -@pytest.mark.parametrize("Z", [1]) # batch +@pytest.mark.parametrize("BATCH", [1]) # batch @pytest.mark.parametrize("H", [12]) # heads -@pytest.mark.parametrize("N_CTX", [4, 128]) # sequence length +@pytest.mark.parametrize("N_CTX", [16, 128]) # sequence length @pytest.mark.parametrize("D_HEAD", [64, 128]) @pytest.mark.parametrize("causal", [True, False]) -def test_attention(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16): +@pytest.mark.parametrize("use_flash", [True, False]) +def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float16): if not deepspeed.HAS_TRITON: pytest.skip("triton has to be installed for the test") + minus_inf = -65504.0 + # skip autotune in
testing from deepspeed.ops.transformer.inference.triton.matmul_ext import fp16_matmul fp16_matmul.skip_autotune() - from deepspeed.ops.transformer.inference.triton.attention import compute_attention + from deepspeed.ops.transformer.inference.triton.attention import _triton_attention, _triton_packed_flash torch.manual_seed(20) - q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) + q = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) + k = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) + v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) sm_scale = 0.3 # reference implementation p = torch.matmul(q, k.transpose(2, 3)) * sm_scale score = p - mask = torch.zeros((Z, H, N_CTX, N_CTX), dtype=dtype, device="cuda") + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) if causal: - for z in range(Z): + for z in range(BATCH): for h in range(H): - mask[:, :, M == 0] = float("-inf") + mask[:, :, M == 0] = minus_inf p = torch.softmax(p.float() + mask, dim=-1).half() softmax_out = p ref_out = torch.matmul(p, v) context = ref_out # adjust it to expected tensor format and run test - qkv = torch.randn((Z, N_CTX, 3 * H * D_HEAD), dtype=dtype, device='cuda', requires_grad=False) - qkv[:, :, :H * D_HEAD] = q.permute(0, 2, 1, 3).contiguous().reshape((Z, N_CTX, H * D_HEAD)) - qkv[:, :, 1 * H * D_HEAD:2 * H * D_HEAD] = k.permute(0, 2, 1, 3).contiguous().reshape((Z, N_CTX, H * D_HEAD)) - qkv[:, :, 2 * H * D_HEAD:] = v.permute(0, 2, 1, 3).contiguous().reshape((Z, N_CTX, H * D_HEAD)) - tri_out = compute_attention(qkv, - input_mask=mask, - layer_past=None, - alibi=None, - scale=sm_scale, - head_size=D_HEAD, - triangular=False, - use_cuda_flash=False, - use_triton_flash=False, - use_ds_attention=False) - tri_out = tri_out.reshape((Z, N_CTX, H, D_HEAD)).permute(0, 2, 1, 3) + qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device='cuda', requires_grad=False) + qkv[:, :, :H * D_HEAD] = q.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) + qkv[:, :, 1 * H * D_HEAD:2 * H * D_HEAD] = k.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) + qkv[:, :, 2 * H * D_HEAD:] = v.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) + + if use_flash: + if not get_accelerator().is_triton_supported(): + pytest.skip("triton flash attention requires compute capability 8.0 or above") + triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device="cuda") + if not causal: + lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device='cuda') + for i, l in enumerate(lengths): + triton_mask[i, ..., l:] = minus_inf + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") + for b in range(BATCH): + mask[b, :, :, lengths[b]:] = minus_inf + ref_out = ref_torch_attention(q, k, v, mask, sm_scale) + tri_out = _triton_packed_flash(qkv, D_HEAD, triton_mask, sm_scale, causal=causal, add_mask=(not causal)) + else: + tri_out = _triton_attention(qkv, + input_mask=mask, + layer_past=None, + alibi=None, + scale=sm_scale, + head_size=D_HEAD, + triangular=False, + use_cuda_flash=False, + use_triton_flash=False, + use_ds_attention=False) +
tri_out = tri_out.reshape((BATCH, N_CTX, H, D_HEAD)).permute(0, 2, 1, 3) assert_almost_equal(ref_out, tri_out)
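For readers who want to call the new kernel outside the test harness, the snippet below distills the call pattern from the DS4Sci_EvoformerAttention test and benchmark above into a forward-only sketch. It is illustrative rather than canonical: it assumes the EvoformerAttn extension is compatible on the local system (the same guard the test applies) and reuses one of the test's tensor shapes; the 1/sqrt(dim) softmax scaling is applied inside the fused op, which is why only the reference implementation in the test passes sm_scale explicitly.

```python
# Minimal usage sketch for DS4Sci_EvoformerAttention (illustrative only).
# Assumptions: the EvoformerAttn op builds/loads on this system (same guard as
# the unit test) and a supported accelerator is available. Shapes follow the
# test's [batch, n, seq_len, heads, dim] layout.
import torch
import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import EvoformerAttnBuilder
from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention

assert deepspeed.ops.__compatible_ops__[EvoformerAttnBuilder.NAME], \
    "DS4Sci_EvoformerAttention ops are not available on this system"

device = get_accelerator().device_name()
dtype = torch.float16
batch, n, seq_len, heads, dim = 1, 256, 256, 4, 32  # one of the test's shapes

Q = torch.randn(batch, n, seq_len, heads, dim, dtype=dtype, device=device)
K = torch.randn(batch, n, seq_len, heads, dim, dtype=dtype, device=device)
V = torch.randn(batch, n, seq_len, heads, dim, dtype=dtype, device=device)
# Two additive biases, broadcast into the [batch, n, heads, seq_len, seq_len] logits.
bias1 = torch.randn(batch, n, 1, 1, seq_len, dtype=dtype, device=device)
bias2 = torch.randn(batch, 1, heads, seq_len, seq_len, dtype=dtype, device=device)

# The fused op applies the 1/sqrt(dim) scaling internally, matching
# attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5)) in the test.
out = DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2])
print(out.shape)  # [batch, n, seq_len, heads, dim], same layout as the inputs
```

Gradients are obtained exactly as in the unit test: create the inputs with requires_grad=True, call out.backward(...) on the result, and read Q.grad, K.grad, V.grad, bias1.grad and bias2.grad.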