OPT model: Fix Where and ScatterND bugs; Free host memory after copying to device for HLSL #453

Open · wants to merge 24 commits into base: main
2 changes: 1 addition & 1 deletion src/nnfusion/common/type/element_type.cpp

@@ -59,7 +59,7 @@ bool element::Type::nnfusion_element_type_to_dtype_string(const element::Type& ng_et,
                                                           std::string& dtype)
 {
     if (ng_et == element::boolean)
-        dtype = "char";
+        dtype = "int";
     else if (ng_et == element::character)
         dtype = "char";
     else if (ng_et == element::f16)
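Note on the element_type.cpp change: mapping element::boolean to "int" instead of "char" is presumably needed because the HLSL backend stores booleans as 4-byte values (HLSL's bool is 32-bit), so a 1-byte element type would make the host and the shader disagree on buffer layout. A minimal illustration of the size mismatch (my sketch, not NNFusion code):

    #include <cstdio>

    int main()
    {
        // A bool tensor of n elements occupies n bytes as char but 4 * n
        // bytes as int; reading one layout as the other scrambles every
        // element after the first.
        std::printf("char: %zu byte(s), int: %zu bytes\n", sizeof(char), sizeof(int));
        return 0;
    }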
25 changes: 25 additions & 0 deletions src/nnfusion/core/graph/gnode.cpp

@@ -416,6 +416,7 @@ void FusedGNode::set_inputs_and_outputs(std::shared_ptr<Graph> graph)
         m_op_ctxs.push_back(ctx);
     }

+    std::unordered_map<std::shared_ptr<GNode>, std::unordered_map<size_t, size_t>> input_id_map;
     // Register input tensors
     for (const auto& m_node : m_order_nodes)
     {
@@ -430,6 +431,7 @@ void FusedGNode::set_inputs_and_outputs(std::shared_ptr<Graph> graph)
                 set_input(input_id, m_node->get_inputs().at(in_edge->get_dst_input()));
                 graph->add_edge(
                     in_edge->get_src(), in_edge->get_src_output(), shared_from_this(), input_id);
+                input_id_map[m_node][in_edge->get_dst_input()] = input_id;
             }
         }
         // Add control-edges as inputs of fused node
@@ -461,6 +463,29 @@
                     has_output = true;
                     set_output(get_output_size(),
                                m_node->get_outputs().at(out_edge->get_src_output()));
+
+                    // get inplace annotation
+                    auto op = std::dynamic_pointer_cast<Op>(m_node->get_op_ptr());
+                    auto op_annotations = op->get_op_annotations();
+                    if (op_annotations)
+                    {
+                        auto oi_pairs = op_annotations->get_in_place_oi_pairs();
+                        for (auto oi_pair : oi_pairs)
+                        {
+                            auto iter = input_id_map.find(m_node);
+                            if (iter != input_id_map.end() && iter->second.count(oi_pair.input) > 0)
+                            {
+                                auto fused_op =
+                                    std::dynamic_pointer_cast<Op>(shared_from_this()->get_op_ptr());
+                                AddInplace(fused_op,
+                                           get_output_size() - 1,
+                                           iter->second[oi_pair.input],
+                                           oi_pair.destructive,
+                                           oi_pair.force_inplace);
+                                //NNFUSION_LOG(INFO) << "========================: node=" << m_node->get_op_type() << ", oi: <" << oi_pair.output << ", " << oi_pair.input << ">";
+                            }
+                        }
+                    }
                 }
                 graph->add_edge(shared_from_this(),
                                 get_output_size() - 1,
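Note on the gnode.cpp change: input_id_map records which fused-node input slot each inner node's input was wired to, so an inner node's in-place output/input pair can be restated in fused-node slot numbers and re-registered via AddInplace. That keeps in-place semantics visible to the memory planner after fusion, plausibly what the ScatterND fix in the PR title relies on. A simplified sketch of the index translation, with types reduced for illustration (the real code uses NNFusion's Op/GNode classes):

    #include <cstddef>
    #include <unordered_map>

    // Reduced stand-in for NNFusion's in-place annotation entry.
    struct OiPair
    {
        std::size_t output;   // output slot on the inner node
        std::size_t input;    // input slot on the inner node
        bool destructive;
        bool force_inplace;
    };

    // Restate an inner node's pair in fused-node slot numbers: the inner
    // input was registered as fused input input_id_map[input], and the
    // output just registered is fused_output_id.
    OiPair to_fused_slots(const OiPair& inner,
                          const std::unordered_map<std::size_t, std::size_t>& input_id_map,
                          std::size_t fused_output_id)
    {
        return {fused_output_id, input_id_map.at(inner.input),
                inner.destructive, inner.force_inplace};
    }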
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/common_langunit.cpp

@@ -23,7 +23,7 @@
 LU_DEFINE(header::chrono, "#include <chrono>\n");
 LU_DEFINE(header::ctime, "#include <ctime>\n");
 LU_DEFINE(header::limits, "#include <limits>\n");
 LU_DEFINE(header::iostream, "#include <iostream>\n");
-LU_DEFINE(header::windows, "#include <windows.h>\n");
+LU_DEFINE(header::windows, "#define NOMINMAX\n#include <windows.h>\n");
 LU_DEFINE(header::unordered_map, "#include <unordered_map>\n");
 LU_DEFINE(header::torch_extension, "#include <torch/extension.h>\n");
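Note on the common_langunit.cpp change: by default windows.h defines min and max as function-like macros, which breaks std::min/std::max and std::numeric_limits<T>::max() in any translation unit that includes it; defining NOMINMAX first suppresses those macros. For example, the following only compiles with the NOMINMAX line present:

    #define NOMINMAX   // must come before windows.h
    #include <windows.h>

    #include <algorithm>
    #include <limits>

    int main()
    {
        // Without NOMINMAX the preprocessor rewrites both calls below using
        // the windows.h max(a, b) macro, producing compile errors.
        int a = std::max(1, 2);
        int b = std::numeric_limits<int>::max();
        return a + b;
    }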
7 changes: 4 additions & 3 deletions src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp

@@ -201,11 +201,12 @@ LanguageUnit_p cuda::get_cudnn_convolution_descriptor(const Shape& padding,
            << "window_dilation_strides_int, CUDNN_CROSS_CORRELATION, " << data_type << "));\n";
     }

-    if(type == nnfusion::element::f16){
+    if (type == nnfusion::element::f16)
+    {
         // half precision, use tensor core
         lu << "CUDNN_SAFE_CALL(cudnnSetConvolutionMathType(" << desc << ", "
-           << "CUDNN_TENSOR_OP_MATH"
-           << "));\n";
+              << "CUDNN_TENSOR_OP_MATH"
+              << "));\n";
     }

     return _lu;
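Note on the cuda_cudnn.cpp change: the reformatted branch emits cudnnSetConvolutionMathType(..., CUDNN_TENSOR_OP_MATH), which allows fp16 convolutions to run on Tensor Cores. Roughly the host code the emitter produces (a sketch; conv_desc stands in for the emitted descriptor name, and CUDNN_SAFE_CALL is the generated error-checking wrapper):

    #include <cudnn.h>

    cudnnConvolutionDescriptor_t conv_desc;
    CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
    // ... cudnnSetConvolutionNdDescriptor(conv_desc, ...) as emitted above ...
    // Opt in to Tensor Core kernels for this convolution (fp16 path only).
    CUDNN_SAFE_CALL(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));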
9 changes: 6 additions & 3 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp

@@ -103,8 +103,9 @@ namespace nnfusion
                 @hCublas@, @transA@, @transB@, @m@, @n@, @k@,
                 &alpha, input1, @lda@, @stride_a@, input0, @ldb@, @stride_b@,
                 &beta, output0, @ldc@, @stride_c@, @batch@));
-)" :
-R"(
+)"
+                    :
+                    R"(
     static const float alpha = @alpha@F, beta = @beta@F;
     // if (!@hCublas@)
     //     CUBLAS_SAFE_CALL(@api_create@(&@hCublas@));
@@ -116,7 +117,9 @@
             {
                 {"hCublas", "cublas_handle"},
                 {"api_create", "cublasCreate"},
-                {"api_exec", dtype == nnfusion::element::f16 ? "cublasHgemmStridedBatched" : "cublasSgemmStridedBatched"},
+                {"api_exec",
+                 dtype == nnfusion::element::f16 ? "cublasHgemmStridedBatched"
+                                                 : "cublasSgemmStridedBatched"},
                 {"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                 {"transB", transA ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                 {"alpha", alpha},
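Note on the batch_matmul.cpp change: besides the reformat, the kernel now selects cublasHgemmStridedBatched for f16 and cublasSgemmStridedBatched for f32. After placeholder substitution, the f32 template expands to roughly the following (a sketch with illustrative names for the @...@ placeholders, not emitted verbatim):

    #include <cublas_v2.h>

    // m, n, k, lda/ldb/ldc, the strides, and batch are compile-time
    // constants substituted for @m@, @n@, @k@, and friends.
    static const float alpha = 1.0F, beta = 0.0F;
    CUBLAS_SAFE_CALL(cublasSgemmStridedBatched(
        cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
        &alpha, input1, lda, stride_a, input0, ldb, stride_b,
        &beta, output0, ldc, stride_c, batch));

The f16 variant has the same call shape but takes __half alpha/beta, which is why the two branches keep separate template strings.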
10 changes: 7 additions & 3 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp

@@ -171,10 +171,14 @@ LanguageUnit_p cuda::BatchNormNCHW::emit_function_body()
         /*
          * todo: may have better solution, details in https://github.com/microsoft/nnfusion/issues/434
          * */
-        if(dtype == nnfusion::element::f16){
-            lu << "output0[st + i] = __hadd(input1[c_id] , __hdiv(__hmul(input0[c_id], __hsub(input2[st + i], input3[c_id])), sqrtf(__hadd(__float2half("
+        if (dtype == nnfusion::element::f16)
+        {
+            lu << "output0[st + i] = __hadd(input1[c_id] , __hdiv(__hmul(input0[c_id], "
+                  "__hsub(input2[st + i], input3[c_id])), sqrtf(__hadd(__float2half("
               << epsilon << "), input4[c_id]))));\n";
-        }else{
+        }
+        else
+        {
             lu << "(input1[c_id] + (input0[c_id] * "
                "(input2[st + i] - input3[c_id]) / sqrtf("
                << epsilon << " + input4[c_id])));\n";
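Note on the batch_norm.cpp change: both branches emit the standard inference-time batch normalization, output = bias + scale * (x - mean) / sqrt(eps + variance); the f16 branch spells it with half intrinsics. Written out as device code (a sketch; the input roles, input0 = scale, input1 = bias, input2 = x, input3 = mean, input4 = variance, are inferred from the emitted expression):

    #include <cuda_fp16.h>

    __device__ __half bn_infer_f16(__half scale, __half bias, __half x,
                                   __half mean, __half var, float eps)
    {
        // sqrtf works in float; cuda_fp16.h's implicit __half <-> float
        // conversions bridge the two, as in the emitted string.
        return __hadd(bias,
                      __hdiv(__hmul(scale, __hsub(x, mean)),
                             sqrtf(__hadd(__float2half(eps), var))));
    }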