Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hold until next release] Onnx bert #240

Draft
wants to merge 37 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
1e112ac
refactor and add some fp16
Niupple Nov 25, 2020
de54196
fix compiling error
Niupple Nov 25, 2020
272d394
changes for fp16
Niupple Nov 25, 2020
7215c91
add pass info to log
Cjkkkk Nov 26, 2020
ebe94e0
fix
Cjkkkk Nov 26, 2020
2e75783
update cudnn datatype mapping
Cjkkkk Nov 26, 2020
0ac891f
add fp16 header to header file
Cjkkkk Nov 26, 2020
1e38b59
vgg11 runnable
Niupple Nov 27, 2020
c91a70c
code style applied
Niupple Nov 27, 2020
a7dcf76
Merge branch 'master' into fp16_kernel
Niupple Dec 1, 2020
1ede972
meet master
Niupple Dec 2, 2020
a3d43f0
fp16 runnable
Niupple Dec 3, 2020
7d12160
fix macro newline
Niupple Dec 4, 2020
f920996
check device type
Niupple Dec 4, 2020
b71e683
code style
Niupple Dec 7, 2020
76fe4fd
fix ROCm unsupported LU's
Niupple Dec 8, 2020
7527c84
Merge branch 'fp16_kernel' into onnx_bert
Niupple Dec 10, 2020
a7bf41e
fix DataBuffer
Niupple Dec 11, 2020
1e42180
onnx changed to DataBuffer style import
Niupple Dec 11, 2020
18ab500
fix onnx fp16
Niupple Dec 17, 2020
979a124
Merge branch 'master' into onnx_bert
Niupple Dec 17, 2020
9dd7529
bert l1 runnable
Niupple Dec 29, 2020
f015bc6
change priority of evaluator runtime
Niupple Jan 5, 2021
86ccb2d
changes for fp16
Niupple Nov 25, 2020
e9fa286
vgg11 runnable
Niupple Nov 27, 2020
64af7da
code style applied
Niupple Nov 27, 2020
3dfda19
fix DataBuffer
Niupple Dec 11, 2020
0bd8eba
onnx changed to DataBuffer style import
Niupple Dec 11, 2020
9cefad5
fix onnx fp16
Niupple Dec 17, 2020
6ded162
bert l1 runnable
Niupple Dec 29, 2020
66abc20
change priority of evaluator runtime
Niupple Jan 5, 2021
44b3ba4
Merge branch 'onnx_bert' of github.com:microsoft/nnfusion into onnx_bert
Niupple Mar 10, 2021
f4598b7
delete comments
Niupple Mar 10, 2021
ef52767
code style
Niupple Mar 10, 2021
80904a2
fix dot
Niupple Mar 10, 2021
65cc6f7
repetitive include
Niupple Mar 10, 2021
40716bb
fix a missing semicolon
Niupple Mar 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cpu/reference/constant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,4 @@ using namespace nnfusion;
using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER("Constant", //op_name
Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
cpu::Constant) // constructor
cpu::Constant) // constructor
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cpu/reference/variable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,4 @@ using namespace nnfusion;
using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER("Variable", //op_name
Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
cpu::Variable) // constructor
cpu::Variable) // constructor
5 changes: 5 additions & 0 deletions src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ LanguageUnit_p cuda::get_math_kernel(const std::string& name,
writer << ")\n";
writer << "{\n";
writer.indent++;
if (name == "convert" && data_types[num_inputs] == "half" && data_types[0] == "int64_t")
{
writer << "return (long long)" + math_kernel << ";\n";
}
else
{
writer << "return " + math_kernel << ";\n";
}
Expand Down
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,4 @@ using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER(
"ApplyAdam",
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
cuda::ApplyAdam)
cuda::ApplyAdam)
18 changes: 16 additions & 2 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
// [a] ./new_kernel_0.cpp
// [b] ../../../ops/op_define/new_op_0.cpp

#include <string>
#include "../cuda_emitter.hpp"
#include "../cuda_langunit.hpp"
#include "nnfusion/core/operators/generic_op/generic_op.hpp"
Expand Down Expand Up @@ -52,6 +53,15 @@ namespace nnfusion
const nnfusion::Shape& input_shape_0 = m_context->inputs[0]->get_shape();
const nnfusion::Shape& input_shape_1 = m_context->inputs[1]->get_shape();

element::Type dtype0 = m_context->inputs[0]->get_element_type();
element::Type dtype1 = m_context->inputs[1]->get_element_type();
element::Type dtype2 = m_context->outputs[0]->get_element_type();
NNFUSION_CHECK(dtype0 == dtype1 && dtype1 == dtype2)
<< "Unsupported element type combination of (" << dtype0.c_type_string()
<< ", " << dtype1.c_type_string() << ") -> " << dtype2.c_type_string()
<< ".";
element::Type& dtype = dtype0;

bool transA = generic_op->localOpConfig.getRoot()["adj_x"]["b"];
bool transB = generic_op->localOpConfig.getRoot()["adj_y"]["b"];
size_t A1 = 1LU;
Expand Down Expand Up @@ -92,10 +102,11 @@ namespace nnfusion
stride_b = A2 * A3, ldc = A4, stride_c = A2 * A4;
}

std::string type = dtype.c_type_string();
float alpha = 1.0f, beta = 0.0f;
auto code = nnfusion::op::create_code_from_template(
R"(
static const float alpha = @alpha@F, beta = @beta@F;
static const @dtype@ alpha = @alpha@, beta = @beta@;
// if (!@hCublas@)
// CUBLAS_SAFE_CALL(@api_create@(&@hCublas@));
CUBLAS_SAFE_CALL(@api_exec@(
Expand All @@ -106,7 +117,9 @@ namespace nnfusion
{
{"hCublas", "cublas_handle"},
{"api_create", "cublasCreate"},
{"api_exec", "cublasSgemmStridedBatched"},
{"api_exec",
dtype == element::f32 ? "cublasSgemmStridedBatched"
: "cublasHgemmStridedBatched"},
{"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
{"transB", transA ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
{"alpha", alpha},
Expand All @@ -121,6 +134,7 @@ namespace nnfusion
{"stride_b", stride_b},
{"stride_c", stride_c},
{"batch", A1},
{"dtype", type},
});

LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
Expand Down
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ using namespace nnfusion;
using namespace nnfusion::kernels;
REGISTER_KERNEL_EMITTER("Constant", //op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
cuda::Constant) // constructor
cuda::Constant) // constructor
243 changes: 95 additions & 148 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
// matrix * vector
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
{
lu << "const float alpha = 1.0;\n const float beta = 0;\n";
lu << "const float alpha = 1.0;\n const float beta = 0.;\n";
lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
if (trans_A)
lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
Expand All @@ -107,7 +107,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
int n = trans_A ? arg0_shape[1] : arg0_shape[0];
int k = trans_A ? arg0_shape[0] : arg0_shape[1];

lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";

lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
<< (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
Expand Down Expand Up @@ -186,7 +186,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
}
}

lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";

lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
<< " CUBLAS_OP_N,"
Expand All @@ -206,162 +206,109 @@ LanguageUnit_p cuda::Dot::emit_function_body()
}
else if (dtype == element::f16)
{
// case 1: Scalar * Tensor
// if (arg0_shape.empty() || arg1_shape.empty())
// {
// auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
// size_t count = nnfusion::shape_size(second);

// string firstarg = (arg0_shape.empty() ? "input1" : "input0");
// string secondarg = (arg0_shape.empty() ? "input0" : "input1");

// lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";

// lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n"; // copy `firstarg` to `output0`
// lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n";
// }
// // case 2: 1d Dot
// else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
// {
// for (int i = 0; i < arg0_shape.size(); i++)
// {
// if (arg0_shape[i] != arg1_shape[i])
// {
// std::vector<std::string> arg_vec{"arg0", "arg1"};
// std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};

// NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
// << nnfusion::join(shape_vec) << " respectively, at Node "
// << m_context->gnode->get_name()
// << ", do not match for dot op";
// }
// }

// size_t count = nnfusion::shape_size(arg0_shape);
// lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";

// lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count
// << ", static_cast<const float*>(input0), 1, static_cast<const float*>(input1), 1, "
// "static_cast<float*>(output0)));\n";
// }
// // matrix * vector
// else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
// {
// lu << "const float alpha = 1.0;\n const float beta = 0;\n";
// lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
// if (trans_A)
// lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
// else
// lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", ";
// lu << " &alpha,"
// << " static_cast<const float*>(input0)," << arg0_shape[1] << ", "
// << " static_cast<const float*>(input1),"
// << " 1,"
// << " &beta,"
// << " static_cast<float*>(output0),"
// << " 1));\n";
// }
// else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
// (trans_A || trans_B))
// {
// int m = trans_B ? arg1_shape[0] : arg1_shape[1];
// int n = trans_A ? arg0_shape[1] : arg0_shape[0];
// int k = trans_A ? arg0_shape[0] : arg0_shape[1];

// lu << "const half alpha = 1.0;\nconst half beta = 0;\n";

// lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
// << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
// << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
// << " " << n << ","
// << " " << k << ","
// << " &alpha,"
// << " static_cast<const half*>(input1),"
// << " " << arg1_shape[1] << ","
// << " static_cast<const half*>(input0),"
// << " " << arg0_shape[1] << ","
// << " &beta,"
// << " static_cast<half*>(output0),"
// << " " << m << "));\n";
// } else {
size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
size_t axes_for_k_count = reduction_axes;
size_t m = 1;
size_t n = 1;
size_t k = 1;

// check if input and output size correct
// check and calculate k for arg0 and arg1
size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
size_t arg1_k_idx = 0; // first axe in arg1 for k

for (size_t i = 0; i < axes_for_k_count; i++)
if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
(trans_A || trans_B))
{
k *= arg0_shape[arg0_k_idx];
if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
int m = trans_B ? arg1_shape[0] : arg1_shape[1];
int n = trans_A ? arg0_shape[1] : arg0_shape[0];
int k = trans_A ? arg0_shape[0] : arg0_shape[1];

NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
lu << "const half alpha = 1.0;\nconst half beta = 0.;\n";

lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
<< (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
<< (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
<< " " << n << ","
<< " " << k << ","
<< " &alpha,"
<< " static_cast<const half*>(input1),"
<< " " << arg1_shape[1] << ","
<< " static_cast<const half*>(input0),"
<< " " << arg0_shape[1] << ","
<< " &beta,"
<< " static_cast<half*>(output0),"
<< " " << m << "));\n";
}
// check and calculate m for arg0 and out
size_t arg0_m_idx = 0; // first axe in arg0 for m
size_t out_m_idx = 0; // first axe in out for m
for (size_t i = 0; i < axes_for_m_count; i++)
else
{
m *= arg0_shape[arg0_m_idx];
if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
size_t axes_for_k_count = reduction_axes;
size_t m = 1;
size_t n = 1;
size_t k = 1;

// check if input and output size correct
// check and calculate k for arg0 and arg1
size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
size_t arg1_k_idx = 0; // first axe in arg1 for k

for (size_t i = 0; i < axes_for_k_count; i++)
{
std::vector<std::string> arg_vec{"arg0", "output"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
k *= arg0_shape[arg0_k_idx];
if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
{
std::vector<std::string> arg_vec{"arg0", "arg1"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};

NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
}
}
// check and calculate n for arg1 and out
size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n
for (size_t i = 0; i < axes_for_n_count; i++)
{
n *= arg1_shape[arg1_n_idx];
if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
// check and calculate m for arg0 and out
size_t arg0_m_idx = 0; // first axe in arg0 for m
size_t out_m_idx = 0; // first axe in out for m
for (size_t i = 0; i < axes_for_m_count; i++)
{
std::vector<std::string> arg_vec{"arg1", "output"};
std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
m *= arg0_shape[arg0_m_idx];
if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
{
std::vector<std::string> arg_vec{"arg0", "output"};
std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};

NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
}
}
// check and calculate n for arg1 and out
size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n
for (size_t i = 0; i < axes_for_n_count; i++)
{
n *= arg1_shape[arg1_n_idx];
if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
{
std::vector<std::string> arg_vec{"arg1", "output"};
std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};

lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";

lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
<< " CUBLAS_OP_N,"
<< " CUBLAS_OP_N,"
<< " " << n << ","
<< " " << m << ","
<< " " << k << ","
<< " &alpha,"
<< " static_cast<const half*>(input1),"
<< " " << n << ","
<< " static_cast<const half*>(input0),"
<< " " << k << ","
<< " &beta,"
<< " static_cast<half*>(output0),"
<< " " << n << "));\n";
// }
NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
<< nnfusion::join(shape_vec) << " respectively, at Node "
<< m_context->gnode->get_name()
<< ", do not match for dot op";
}
}

lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";

lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
<< " CUBLAS_OP_N,"
<< " CUBLAS_OP_N,"
<< " " << n << ","
<< " " << m << ","
<< " " << k << ","
<< " &alpha,"
<< " static_cast<const half*>(input1),"
<< " " << n << ","
<< " static_cast<const half*>(input0),"
<< " " << k << ","
<< " &beta,"
<< " static_cast<half*>(output0),"
<< " " << n << "));\n";
}
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,4 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency()
REGISTER_KERNEL_EMITTER(
"DynamicStitch", // op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
cuda::DynamicStitch) // constructor
cuda::DynamicStitch) // constructor
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,4 @@ KernelRegistrar kernel_registrar0(
REGISTER_KERNEL_EMITTER(
"Pad", // op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
cuda::Pad) // constructor
cuda::Pad) // constructor
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ LanguageUnit_p cuda::Range::emit_dependency()
REGISTER_KERNEL_EMITTER(
"Range", // op_name
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
cuda::Range) // constructor
cuda::Range) // constructor
Loading