Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hold until next release] Onnx bert #240

Draft
wants to merge 37 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
1e112ac
refactor and add some fp16
Niupple Nov 25, 2020
de54196
fix compiling error
Niupple Nov 25, 2020
272d394
changes for fp16
Niupple Nov 25, 2020
7215c91
add pass info to log
Cjkkkk Nov 26, 2020
ebe94e0
fix
Cjkkkk Nov 26, 2020
2e75783
update cudnn datatype mapping
Cjkkkk Nov 26, 2020
0ac891f
add fp16 header to header file
Cjkkkk Nov 26, 2020
1e38b59
vgg11 runnable
Niupple Nov 27, 2020
c91a70c
code style applied
Niupple Nov 27, 2020
a7dcf76
Merge branch 'master' into fp16_kernel
Niupple Dec 1, 2020
1ede972
meet master
Niupple Dec 2, 2020
a3d43f0
fp16 runnable
Niupple Dec 3, 2020
7d12160
fix macro newline
Niupple Dec 4, 2020
f920996
check device type
Niupple Dec 4, 2020
b71e683
code style
Niupple Dec 7, 2020
76fe4fd
fix ROCm unsupported LU's
Niupple Dec 8, 2020
7527c84
Merge branch 'fp16_kernel' into onnx_bert
Niupple Dec 10, 2020
a7bf41e
fix DataBuffer
Niupple Dec 11, 2020
1e42180
onnx changed to DataBuffer style import
Niupple Dec 11, 2020
18ab500
fix onnx fp16
Niupple Dec 17, 2020
979a124
Merge branch 'master' into onnx_bert
Niupple Dec 17, 2020
9dd7529
bert l1 runnable
Niupple Dec 29, 2020
f015bc6
change priority of evaluator runtime
Niupple Jan 5, 2021
86ccb2d
changes for fp16
Niupple Nov 25, 2020
e9fa286
vgg11 runnable
Niupple Nov 27, 2020
64af7da
code style applied
Niupple Nov 27, 2020
3dfda19
fix DataBuffer
Niupple Dec 11, 2020
0bd8eba
onnx changed to DataBuffer style import
Niupple Dec 11, 2020
9cefad5
fix onnx fp16
Niupple Dec 17, 2020
6ded162
bert l1 runnable
Niupple Dec 29, 2020
66abc20
change priority of evaluator runtime
Niupple Jan 5, 2021
44b3ba4
Merge branch 'onnx_bert' of github.com:microsoft/nnfusion into onnx_bert
Niupple Mar 10, 2021
f4598b7
delete comments
Niupple Mar 10, 2021
ef52767
code style
Niupple Mar 10, 2021
80904a2
fix dot
Niupple Mar 10, 2021
65cc6f7
repetitive include
Niupple Mar 10, 2021
40716bb
fix a missing semicolon
Niupple Mar 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cpu/reference/constant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,4 @@ using namespace nnfusion;
using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER("Constant", //op_name
Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
cpu::Constant) // constructor
cpu::Constant) // constructor
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cpu/reference/variable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,4 @@ using namespace nnfusion;
using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER("Variable", //op_name
Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
cpu::Variable) // constructor
cpu::Variable) // constructor
5 changes: 5 additions & 0 deletions src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ LanguageUnit_p cuda::get_math_kernel(const std::string& name,
writer << ")\n";
writer << "{\n";
writer.indent++;
if (name == "convert" && data_types[num_inputs] == "half" && data_types[0] == "int64_t")
{
writer << "return (long long)" + math_kernel << ";\n";
}
else
{
writer << "return " + math_kernel << ";\n";
}
Expand Down
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,4 @@ using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER(
"ApplyAdam",
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
cuda::ApplyAdam)
cuda::ApplyAdam)
18 changes: 16 additions & 2 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
// [a] ./new_kernel_0.cpp
// [b] ../../../ops/op_define/new_op_0.cpp

#include <string>
#include "../cuda_emitter.hpp"
#include "../cuda_langunit.hpp"
#include "nnfusion/core/operators/generic_op/generic_op.hpp"
Expand Down Expand Up @@ -52,6 +53,15 @@ namespace nnfusion
const nnfusion::Shape& input_shape_0 = m_context->inputs[0]->get_shape();
const nnfusion::Shape& input_shape_1 = m_context->inputs[1]->get_shape();

element::Type dtype0 = m_context->inputs[0]->get_element_type();
element::Type dtype1 = m_context->inputs[1]->get_element_type();
element::Type dtype2 = m_context->outputs[0]->get_element_type();
NNFUSION_CHECK(dtype0 == dtype1 && dtype1 == dtype2)
<< "Unsupported element type combination of (" << dtype0.c_type_string()
<< ", " << dtype1.c_type_string() << ") -> " << dtype2.c_type_string()
<< ".";
element::Type& dtype = dtype0;

bool transA = generic_op->localOpConfig.getRoot()["adj_x"]["b"];
bool transB = generic_op->localOpConfig.getRoot()["adj_y"]["b"];
size_t A1 = 1LU;
Expand Down Expand Up @@ -92,10 +102,11 @@ namespace nnfusion
stride_b = A2 * A3, ldc = A4, stride_c = A2 * A4;
}

std::string type = dtype.c_type_string();
float alpha = 1.0f, beta = 0.0f;
auto code = nnfusion::op::create_code_from_template(
R"(
static const float alpha = @alpha@F, beta = @beta@F;
static const @dtype@ alpha = @alpha@, beta = @beta@;
// if (!@hCublas@)
// CUBLAS_SAFE_CALL(@api_create@(&@hCublas@));
CUBLAS_SAFE_CALL(@api_exec@(
Expand All @@ -106,7 +117,9 @@ namespace nnfusion
{
{"hCublas", "cublas_handle"},
{"api_create", "cublasCreate"},
{"api_exec", "cublasSgemmStridedBatched"},
{"api_exec",
dtype == element::f32 ? "cublasSgemmStridedBatched"
: "cublasHgemmStridedBatched"},
{"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
{"transB", transA ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
{"alpha", alpha},
Expand All @@ -121,6 +134,7 @@ namespace nnfusion
{"stride_b", stride_b},
{"stride_c", stride_c},
{"batch", A1},
{"dtype", type},
});

LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
Expand Down
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ using namespace nnfusion;
using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER("Constant", //op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
cuda::Constant) // constructor
cuda::Constant) // constructor
243 changes: 95 additions & 148 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
// matrix * vector
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
{
lu << "const float alpha = 1.0;\n const float beta = 0;\n";
lu << "const float alpha = 1.0;\n const float beta = 0.;\n";
lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
if (trans_A)
lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
Expand All @@ -107,7 +107,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
int n = trans_A ? arg0_shape[1] : arg0_shape[0];
int k = trans_A ? arg0_shape[0] : arg0_shape[1];

lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";

lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
<< (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
Expand Down Expand Up @@ -186,7 +186,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
}
}

lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";

lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
<< " CUBLAS_OP_N,"
Expand All @@ -206,162 +206,109 @@ LanguageUnit_p cuda::Dot::emit_function_body()
}
else if (dtype == element::f16)
{
// case 1: Scalar * Tensor
// if (arg0_shape.empty() || arg1_shape.empty())
// {
// auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
// size_t count = nnfusion::shape_size(second);

// string firstarg = (arg0_shape.empty() ? "input1" : "input0");
// string secondarg = (arg0_shape.empty() ? "input0" : "input1");

// lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";

// lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n"; // copy `firstarg` to `output0`
// lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n";
// }
// // case 2: 1d Dot
// else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
// {
// for (int i = 0; i < arg0_shape.size(); i++)
// {
// if (arg0_shape[i] != arg1_shape[i])
// {
// std::vector<std::string> arg_vec{"arg0", "arg1"};
// std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};

// NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
// << nnfusion::join(shape_vec) << " respectively, at Node "
// << m_context->gnode->get_name()
// << ", do not match for dot op";
// }
// }

// size_t count = nnfusion::shape_size(arg0_shape);
// lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";

// lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count
// << ", static_cast<const float*>(input0), 1, static_cast<const float*>(input1), 1, "
// "static_cast<float*>(output0)));\n";
// }
// // matrix * vector
// else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
// {
// lu << "const float alpha = 1.0;\n const float beta = 0;\n";
// lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
// if (trans_A)
// lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
// else
// lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", ";
// lu << " &alpha,"
// << " static_cast<const float*>(input0)," << arg0_shape[1] << ", "
// << " static_cast<const float*>(input1),"
// << " 1,"
// << " &beta,"
// << " static_cast<float*>(output0),"
// << " 1));\n";
// }
// else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
// (trans_A || trans_B))
// {
// int m = trans_B ? arg1_shape[0] : arg1_shape[1];
// int n = trans_A ? arg0_shape[1] : arg0_shape[0];
// int k = trans_A ? arg0_shape[0] : arg0_shape[1];

// lu << "const half alpha = 1.0;\nconst half beta = 0;\n";

// lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
// << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
// << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
// << " " << n << ","
// << " " << k << ","
// << " &alpha,"
// << " static_cast<const half*>(input1),"
// << " " << arg1_shape[1] << ","
// << " static_cast<const half*>(input0),"
// << " " << arg0_shape[1] << ","
// << " &beta,"
// << " static_cast<half*>(output0),"
// << " " << m << "));\n";
// } else {
size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
size_t axes_for_k_count = reduction_axes;
size_t m = 1;
size_t n = 1;
size_t k = 1;

// check if input and output size correct
// check and calculate k for arg0 and arg1
size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
size_t arg1_k_idx = 0; // first axe in arg1 for k

for (size_t i = 0; i < axes_for_k_count; i++)
if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
(trans_A || trans_B))
{
k *= arg0_shape[arg0_k_idx];
if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
int m = trans_B ? arg1_shape[0] : arg1_shape[1];
int n = trans_A ? arg0_shape[1] : arg0_shape[0];
int k = trans_A ? arg0_shape[0] : arg0_shape[1];

NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
lu << "const half alpha = 1.0;\nconst half beta = 0.;\n";

lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
<< (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
<< (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
<< " " << n << ","
<< " " << k << ","
<< " &alpha,"
<< " static_cast<const half*>(input1),"
<< " " << arg1_shape[1] << ","
<< " static_cast<const half*>(input0),"
<< " " << arg0_shape[1] << ","
<< " &beta,"
<< " static_cast<half*>(output0),"
<< " " << m << "));\n";
}
// check and calculate m for arg0 and out
size_t arg0_m_idx = 0; // first axe in arg0 for m
size_t out_m_idx = 0; // first axe in out for m
for (size_t i = 0; i < axes_for_m_count; i++)
else
{
m *= arg0_shape[arg0_m_idx];
if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
size_t axes_for_k_count = reduction_axes;
size_t m = 1;
size_t n = 1;
size_t k = 1;

// check if input and output size correct
// check and calculate k for arg0 and arg1
size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
size_t arg1_k_idx = 0; // first axe in arg1 for k

for (size_t i = 0; i < axes_for_k_count; i++)
{
std::vector<std::string> arg_vec{"arg0", "output"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
k *= arg0_shape[arg0_k_idx];
if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};

NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
}
}
// check and calculate n for arg1 and out
size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n
for (size_t i = 0; i < axes_for_n_count; i++)
{
n *= arg1_shape[arg1_n_idx];
if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
// check and calculate m for arg0 and out
size_t arg0_m_idx = 0; // first axe in arg0 for m
size_t out_m_idx = 0; // first axe in out for m
for (size_t i = 0; i < axes_for_m_count; i++)
{
std::vector<std::string> arg_vec{"arg1", "output"};
std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
m *= arg0_shape[arg0_m_idx];
if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
{
std::vector<std::string> arg_vec{"arg0", "output"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};

NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
}
}
// check and calculate n for arg1 and out
size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n
for (size_t i = 0; i < axes_for_n_count; i++)
{
n *= arg1_shape[arg1_n_idx];
if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
{
std::vector<std::string> arg_vec{"arg1", "output"};
std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};

lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";

lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
<< " CUBLAS_OP_N,"
<< " CUBLAS_OP_N,"
<< " " << n << ","
<< " " << m << ","
<< " " << k << ","
<< " &alpha,"
<< " static_cast<const half*>(input1),"
<< " " << n << ","
<< " static_cast<const half*>(input0),"
<< " " << k << ","
<< " &beta,"
<< " static_cast<half*>(output0),"
<< " " << n << "));\n";
// }
NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
}

lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";

lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
<< " CUBLAS_OP_N,"
<< " CUBLAS_OP_N,"
<< " " << n << ","
<< " " << m << ","
<< " " << k << ","
<< " &alpha,"
<< " static_cast<const half*>(input1),"
<< " " << n << ","
<< " static_cast<const half*>(input0),"
<< " " << k << ","
<< " &beta,"
<< " static_cast<half*>(output0),"
<< " " << n << "));\n";
}
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,4 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency()
REGISTER_KERNEL_EMITTER(
"DynamicStitch", // op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
cuda::DynamicStitch) // constructor
cuda::DynamicStitch) // constructor
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,4 @@ KernelRegistrar kernel_registrar0(
REGISTER_KERNEL_EMITTER(
"Pad", // op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
cuda::Pad) // constructor
cuda::Pad) // constructor
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ LanguageUnit_p cuda::Range::emit_dependency()
REGISTER_KERNEL_EMITTER(
"Range", // op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
cuda::Range) // constructor
cuda::Range) // constructor
Loading