diff --git a/.gitmodules b/.gitmodules
index c5e0419..91c7e94 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,4 +4,8 @@
 
 [submodule "lightseq"]
 	path = lightseq
-	url = https://github.com/thu-coai/lightseq-nat
\ No newline at end of file
+	url = https://github.com/thu-coai/lightseq-nat
+
+[submodule "cub"]
+	path = cub
+	url = https://github.com/NVIDIA/cub
\ No newline at end of file
diff --git a/README.md b/README.md
index 5bfffda..4c12da6 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ This repo is modified from [``fairseq:5175fd``](https://github.com/pytorch/fairs
 ## Requirements & Installation
 
 * Python >= 3.7
-* Pytorch == 1.10.1
+* PyTorch == 1.10.1 (tested with CUDA 10.2 and 11.3)
 * gcc >= 7.0.0 (for compiling cuda operations. See FAQs if you want to use a lower gcc version)
 * ``git clone --recurse-submodules https://github.com/thu-coai/DA-Transformer.git && pip install -e .``
 * (Optional) Customized LightSeq for NAT (``cd lightseq && pip install -e .``)
@@ -43,6 +43,7 @@ Most codes of the framework are from Fairseq. We mainly add the following files.
 fs_plugins
 ├── criterions
 │   └── nat_dag_loss.py                   # DA-Transformer loss
+├── cub                                   # Requirement: NVIDIA CUB library (git submodule), included when compiling the cuda operations
 ├── custom_ops                            # operations implementations and cuda kernels
 │   ├── dag_best_alignment.cu
 │   ├── logsoftmax_gather.cu
diff --git a/cub b/cub
new file mode 160000
index 0000000..618a46c
--- /dev/null
+++ b/cub
@@ -0,0 +1 @@
+Subproject commit 618a46c27764f0e0b86fb3643a572ed039180ad8
diff --git a/fs_plugins/custom_ops/dag_loss.py b/fs_plugins/custom_ops/dag_loss.py
index 612324e..fe98a2e 100644
--- a/fs_plugins/custom_ops/dag_loss.py
+++ b/fs_plugins/custom_ops/dag_loss.py
@@ -16,6 +16,7 @@
 
 import os
 import math
+import sys
 
 import torch
 from torch import nn, Tensor
@@ -39,6 +40,7 @@ def get_dag_kernel():
     if dag_kernel is not None:
         return dag_kernel
     else:
+        print("Start compiling cuda operations for DA-Transformer...", file=sys.stderr, flush=True)
         dag_kernel = load(
             "dag_loss_fn",
             sources=[
@@ -47,9 +49,11 @@ def get_dag_kernel():
                 os.path.join(module_path, "dag_best_alignment.cu"),
                 os.path.join(module_path, "logsoftmax_gather.cu"),
             ],
-            extra_cflags=['-DOF_SOFTMAX_USE_FAST_MATH'],
-            extra_cuda_cflags=['-DOF_SOFTMAX_USE_FAST_MATH'],
+            extra_cflags=['-DOF_SOFTMAX_USE_FAST_MATH', '-O3'],
+            extra_cuda_cflags=['-DOF_SOFTMAX_USE_FAST_MATH', '-O3'],
+            extra_include_paths=[os.path.join(module_path, "../../cub")],
         )
+        print("Cuda operations compiling finished", file=sys.stderr, flush=True)
         return dag_kernel
 
 class DagLossFunc(Function):
diff --git a/setup.py b/setup.py
index 196009c..164624d 100644
--- a/setup.py
+++ b/setup.py
@@ -224,6 +224,7 @@ def do_setup(package_data):
             "sacrebleu[ja]",
             "tqdm",
             "bitarray",
+            "ninja"
         ],
         dependency_links=dependency_links,
         packages=find_packages(