From f72b206513c52df80012e9de4178befdf363a0d4 Mon Sep 17 00:00:00 2001
From: Jilong Xue
Date: Fri, 15 Apr 2022 17:46:26 +0000
Subject: [PATCH] revert broadcast execution and fix batchnorm folding

---
 .../engine/pass/codegen/cuda_codegen_pass.cpp |  3 ++-
 .../batchnorm_inference_folding_pass.cpp      | 26 +++++++++----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 0e1fb1bfe..c03435ceb 100644
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -1133,6 +1133,7 @@ void CudaCodegenPass::create_main_file(std::shared_ptr<InterpreterContext> ctx,
         lu_main << get_d2hcopy(tu)->get_code();
         lu_main << get_sync()->get_code();
     }
+    /*
     for (size_t i = 0; i < tu->out.size(); i++)
     {
         auto& tensor = *tu->out[i];
@@ -1143,7 +1144,7 @@ void CudaCodegenPass::create_main_file(std::shared_ptr<InterpreterContext> ctx,
                 << "\nprintf(\" .. (size = " << tensor.get_tensor_layout()->get_size()
                 << ", ends with %e);\\n\", (float)" << tensor.get_name() << "_host["
                 << tensor.get_tensor_layout()->get_size() - 1 << "]);\n";
-    }
+    }*/
     lu_main.block_end();
 
     lu_main << "\n//GPU time measurement\n";
diff --git a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
index 6f691b32d..a90015390 100644
--- a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
+++ b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
@@ -622,10 +622,10 @@ class BatchNormInferenceOptimizer
         auto new_broadcast_gnode = m_graph->add_node_and_edge(
             std::make_shared<op::Broadcast>(conv_output_shape, broadcast_axes),
             {new_conv_bias_gnode});
-        shared_ptr<KernelContext> ke_ctx(new KernelContext(new_broadcast_gnode));
-        KernelEmitter::Pointer any_op_ke = std::make_shared<cuda::AnyOP>(ke_ctx);
-        any_op_ke->get_or_emit_source();
-        (*new_broadcast_gnode)["Kernel_Selection_Result"] =
-            std::make_pair(NNFusion_DeviceType::CUDA_GPU, any_op_ke);
+        // shared_ptr<KernelContext> ke_ctx(new KernelContext(new_broadcast_gnode));
+        // KernelEmitter::Pointer any_op_ke = std::make_shared<cuda::AnyOP>(ke_ctx);
+        // any_op_ke->get_or_emit_source();
+        // (*new_broadcast_gnode)["Kernel_Selection_Result"] =
+        //     std::make_pair(NNFusion_DeviceType::CUDA_GPU, any_op_ke);
         m_nodes.resize(m_graph->get_max_node_id());
         m_nodes[new_broadcast_gnode->get_id()] = std::make_shared<TaggedNode>();
@@ -796,10 +796,10 @@ class BatchNormInferenceOptimizer
         auto new_broadcast_gnode = m_graph->add_node_and_edge(
             std::make_shared<op::Broadcast>(conv_output_shape, broadcast_axes),
             {bn_node->get_in_edge(1)->get_src()});
-        shared_ptr<KernelContext> ke_ctx(new KernelContext(new_broadcast_gnode));
-        KernelEmitter::Pointer any_op_ke = std::make_shared<cuda::AnyOP>(ke_ctx);
-        any_op_ke->get_or_emit_source();
-        (*new_broadcast_gnode)["Kernel_Selection_Result"] = std::make_pair(NNFusion_DeviceType::CUDA_GPU, any_op_ke);;
+        // shared_ptr<KernelContext> ke_ctx(new KernelContext(new_broadcast_gnode));
+        // KernelEmitter::Pointer any_op_ke = std::make_shared<cuda::AnyOP>(ke_ctx);
+        // any_op_ke->get_or_emit_source();
+        // (*new_broadcast_gnode)["Kernel_Selection_Result"] = std::make_pair(NNFusion_DeviceType::CUDA_GPU, any_op_ke);;
         m_nodes.resize(m_graph->get_max_node_id());
         m_nodes[new_broadcast_gnode->get_id()] = std::make_shared<TaggedNode>();
         m_nodes[new_broadcast_gnode->get_id()]->node = new_broadcast_gnode;
@@ -903,11 +903,11 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptr<Graph>& graph)
get_name(); }