From 912c1267c310f1b8c44868d6468415858e00b614 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 7 Dec 2021 12:41:51 +0000 Subject: [PATCH 01/24] tmp fix --- .../core/kernels/cuda_gpu/cuda_cudnn.cpp | 15 +++-- .../kernels/cuda_gpu/kernels/avg_pool.cpp | 23 +++---- src/nnfusion/engine/device/cuda.cpp | 7 +- src/nnfusion/engine/pass/graph/CMakeLists.txt | 1 + src/nnfusion/engine/pass/graph/dump_op.cpp | 67 +++++++++++++++++++ src/nnfusion/engine/pass/graph/dump_op.hpp | 25 +++++++ .../conv_elementwise_fusion_optimizer.cpp | 21 ++++-- 7 files changed, 136 insertions(+), 23 deletions(-) create mode 100644 src/nnfusion/engine/pass/graph/dump_op.cpp create mode 100644 src/nnfusion/engine/pass/graph/dump_op.hpp diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp index 0926fb7e2..7dd34f96c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp @@ -289,16 +289,21 @@ LanguageUnit_p { dimensions[pos++] = static_cast(shape[i]); } + // lu << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptorEx(" << desc << ", " << data_type << ", " + // << dimensions[0] << ", " << dimensions[1] << ", " << dimensions[2] << ", " + // << dimensions[3] << ", 1, 1, 1, 1));\n"; lu << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptorEx(" << desc << ", " << data_type << ", " - << dimensions[0] << ", " << dimensions[1] << ", " << dimensions[2] << ", " - << dimensions[3] << ", 1, 1, 1, 1));\n"; + << "1, " << dimensions[1] << ", 1, 1, 1, 1, 1, 1));\n"; } else if (shape.size() == 4) { + // lu << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptorEx(" << desc << ", " << data_type << ", " + // << static_cast(shape[0]) << ", " << static_cast(shape[1]) << ", " + // << static_cast(shape[2]) << ", " << static_cast(shape[3]) + // << ", 1, 1, 1, 1));\n"; + lu << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptorEx(" << desc << ", " << data_type << ", " - << static_cast(shape[0]) << ", " << static_cast(shape[1]) << ", " 
- << static_cast(shape[2]) << ", " << static_cast(shape[3]) - << ", 1, 1, 1, 1));\n"; + << "1, " << static_cast(shape[1]) << ",1, 1, 1, 1, 1, 1));\n"; } return _lu; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp index ca26d3980..7a0fbae85 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp @@ -282,9 +282,6 @@ cuda::AvgPoolmD::AvgPoolmD(shared_ptr ctx) LanguageUnit_p cuda::AvgPoolmD::emit_function_body() { - if (input_shape.size() != 4 && input_shape.size() != 5) - return nullptr; - LanguageUnit_p _lu(new LanguageUnit(get_function_name())); auto& lu = *_lu; auto rank = input_shape.size(); @@ -297,8 +294,8 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_body() window_shape.insert(window_shape.begin(), 1); padding_below.insert(padding_below.begin(), 0); window_stride.insert(window_stride.begin(), 1); - _input_shape.insert(_input_shape.begin() + 1, 1); - _output_shape.insert(_output_shape.begin() + 1, 1); + _input_shape.insert(_input_shape.begin() + 2, 1); + _output_shape.insert(_output_shape.begin() + 2, 1); rank = 4; } @@ -517,10 +514,10 @@ LanguageUnit_p cuda::AvgPoolmDGrad::emit_function_body() window_shape.insert(window_shape.begin(), 1); padding_below.insert(padding_below.begin(), 0); window_stride.insert(window_stride.begin(), 1); - _input_shape.insert(_input_shape.begin() + 1, 1); - _output_shape.insert(_output_shape.begin() + 1, 1); - _d_input_shape.insert(_d_input_shape.begin() + 1, 1); - _d_output_shape.insert(_d_output_shape.begin() + 1, 1); + _input_shape.insert(_input_shape.begin() + 2, 1); + _output_shape.insert(_output_shape.begin() + 2, 1); + _d_input_shape.insert(_d_input_shape.begin() + 2, 1); + _d_output_shape.insert(_d_output_shape.begin() + 2, 1); rank = 4; } @@ -609,10 +606,10 @@ LanguageUnit_p cuda::AvgPoolmDGrad::emit_function_body() return _lu; } -REGISTER_KERNEL_EMITTER( - "AvgPool", 
// op_name - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::AvgPool1D) // constructor +// REGISTER_KERNEL_EMITTER( +// "AvgPool", // op_name +// Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs +// cuda::AvgPool1D) // constructor REGISTER_KERNEL_EMITTER( "AvgPool", // op_name diff --git a/src/nnfusion/engine/device/cuda.cpp b/src/nnfusion/engine/device/cuda.cpp index 738c607c0..63dcf7437 100644 --- a/src/nnfusion/engine/device/cuda.cpp +++ b/src/nnfusion/engine/device/cuda.cpp @@ -11,6 +11,7 @@ #include "nnfusion/engine/pass/graph/blockfusion_pass.hpp" #include "nnfusion/engine/pass/graph/common_subexpression_elimination_pass.hpp" #include "nnfusion/engine/pass/graph/dot_transpose_pass.hpp" +#include "nnfusion/engine/pass/graph/dump_op.hpp" #include "nnfusion/engine/pass/graph/gemm_fusion_pass.hpp" #include "nnfusion/engine/pass/graph/gnode_device_dispatcher.hpp" #include "nnfusion/engine/pass/graph/gradient_weight_mapping_pass.hpp" @@ -46,7 +47,9 @@ CudaEngine::CudaEngine() : Engine() { g_passes->push_back(make_shared()); - g_passes->push_back(make_shared()); + // g_passes->push_back(make_shared()); + // g_passes->push_back(make_shared()); + // g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); @@ -54,6 +57,7 @@ CudaEngine::CudaEngine() g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); + g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); //superscaler pass g_passes->push_back(make_shared()); @@ -69,6 +73,7 @@ CudaEngine::CudaEngine() g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); + g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); g_passes->push_back(make_shared()); diff --git 
a/src/nnfusion/engine/pass/graph/CMakeLists.txt b/src/nnfusion/engine/pass/graph/CMakeLists.txt index b067120be..f68c96c71 100644 --- a/src/nnfusion/engine/pass/graph/CMakeLists.txt +++ b/src/nnfusion/engine/pass/graph/CMakeLists.txt @@ -29,6 +29,7 @@ set(SRC superscaler_dataparallelism_pass.cpp ir_based_fusion_pass.cpp subgraph_fusion_pass.cpp + dump_op.cpp ) diff --git a/src/nnfusion/engine/pass/graph/dump_op.cpp b/src/nnfusion/engine/pass/graph/dump_op.cpp new file mode 100644 index 000000000..a7b3b66ca --- /dev/null +++ b/src/nnfusion/engine/pass/graph/dump_op.cpp @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "dump_op.hpp" + +using namespace nnfusion; +using namespace nnfusion::pass::graph; +using namespace nnfusion::profiler; + +DEFINE_string(fdump_op_file, "", ""); + +bool DumpOp::run_on_graph(std::shared_ptr& graph) +{ + if (FLAGS_fdump_op_file == "") + return true; + + std::ofstream out(FLAGS_fdump_op_file); + out << "op\tinput\toutput\tfused nodes\n"; + std::vector> nodes = graph->get_ordered_ops(); + for (auto it : nodes) + { + out << it->get_op_type() << "\t"; + for (size_t i = 0; i < it->get_inputs().size(); i++) + { + auto in_shape = it->get_input_shape(i); + out << in_shape; + } + out << "\t"; + + for (size_t i = 0; i < it->get_outputs().size(); i++) + { + auto out_shape = it->get_output_shape(i); + out << out_shape; + } + out << "\t"; + if (it->get_op_type() == "ElementWiseFused") + { + auto node = std::static_pointer_cast(it); + NNFUSION_CHECK_NOT_NULLPTR(node); + auto ctxs = node->get_op_contexts(); + for (auto c : ctxs) + { + out << c->op->get_op_type() << ", inputs: "; + for (size_t j = 0; j < c->inputs.size(); j++) + { + out << c->inputs[j]->get_shape() << ", "; + } + out << "outputs: "; + for (size_t j = 0; j < c->outputs.size(); j++) + { + out << c->outputs[j]->get_shape() << ", "; + } + out << "\t"; + } + } + else if (it->get_op_type() == "Convolution") + { + auto op = 
static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << op->get_activation() << "\t"; + } + out << "\n"; + } + out << std::endl; + + return true; +} \ No newline at end of file diff --git a/src/nnfusion/engine/pass/graph/dump_op.hpp b/src/nnfusion/engine/pass/graph/dump_op.hpp new file mode 100644 index 000000000..526020b34 --- /dev/null +++ b/src/nnfusion/engine/pass/graph/dump_op.hpp @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include "graph_pass_base.hpp" +#include "nnfusion/common/common.hpp" +#include "nnfusion/engine/cache/manager.hpp" +#include "nnfusion/engine/op.hpp" +#include "nnfusion/engine/profiler/profiler.hpp" + +namespace nnfusion +{ + namespace pass + { + namespace graph + { + class DumpOp : public GraphPassBase + { + public: + bool run_on_graph(std::shared_ptr& graph) override; + }; + } + } +} // namespace nnfusion \ No newline at end of file diff --git a/src/nnfusion/engine/pass/graph/subgraph_fusion_optimizer/conv_elementwise_fusion_optimizer.cpp b/src/nnfusion/engine/pass/graph/subgraph_fusion_optimizer/conv_elementwise_fusion_optimizer.cpp index 159182e19..e059c42db 100644 --- a/src/nnfusion/engine/pass/graph/subgraph_fusion_optimizer/conv_elementwise_fusion_optimizer.cpp +++ b/src/nnfusion/engine/pass/graph/subgraph_fusion_optimizer/conv_elementwise_fusion_optimizer.cpp @@ -43,12 +43,13 @@ bool ConvElemFusionOptimizer::fuse_subgraph(SubGraphRecord::Pointer subgraph_rec auto bias = pr->nodes[1]; std::shared_ptr relu; + // NNFUSION_LOG(INFO) <<" =============" << subgraph_record->subgraph->name; if (subgraph_record->subgraph->name == "conv_bias_relu") { relu = pr->nodes[2]; } - std::shared_ptr bias_input; + std::shared_ptr bias_input, bias_broadcast; int bias_input_idx; for (auto in_edge : bias->get_in_edges()) @@ -56,8 +57,20 @@ bool ConvElemFusionOptimizer::fuse_subgraph(SubGraphRecord::Pointer subgraph_rec auto src = in_edge->get_src(); if (src != 
conv) { - bias_input = src; - bias_input_idx = in_edge->get_src_output(); + if (src->get_op_type() == "Broadcast") + { + bias_broadcast = src; + bias_input = bias_broadcast->get_in_edge(0)->get_src(); + bias_input_idx = 0; + } + else + { + bias_input = src; + bias_input_idx = in_edge->get_src_output(); + } + + // NNFUSION_LOG(INFO) << "bias_input_idx: " << bias_input_idx; + // NNFUSION_LOG(INFO) << bias_input->get_op_type(); break; } } @@ -96,7 +109,7 @@ bool ConvElemFusionOptimizer::fuse_subgraph(SubGraphRecord::Pointer subgraph_rec graph->add_edge(new_conv, 0, dst, y); } std::unordered_set> nodes_to_remove; - nodes_to_remove.insert({conv, bias}); + nodes_to_remove.insert({conv, bias, bias_broadcast}); if (relu) nodes_to_remove.insert(relu); return RemoveNodes(nodes_to_remove, new_conv); From 8f90dc368771e5b57c939f085039e7c6e2fe1023 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Wed, 8 Dec 2021 10:07:57 +0000 Subject: [PATCH 02/24] fix avgpool --- src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp index 7a0fbae85..ea7bf3bf5 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp @@ -508,7 +508,7 @@ LanguageUnit_p cuda::AvgPoolmDGrad::emit_function_body() auto _d_input_shape = d_input_shape; auto _output_shape = output_shape; auto _d_output_shape = d_output_shape; - + NNFUSION_LOG(INFO) << "---------4"; if (rank == 3) { window_shape.insert(window_shape.begin(), 1); @@ -520,7 +520,7 @@ LanguageUnit_p cuda::AvgPoolmDGrad::emit_function_body() _d_output_shape.insert(_d_output_shape.begin() + 2, 1); rank = 4; } - + NNFUSION_LOG(INFO) << "---------5"; // y dy x dx auto input_desc = cudnn_tensor_descriptor_from_shape(_input_shape, "input_desc", input_type); auto d_input_desc = @@ -602,7 +602,7 @@ LanguageUnit_p 
cuda::AvgPoolmDGrad::emit_function_body() lu << "CUDNN_SAFE_CALL(cudnnDestroyTensorDescriptor(output_desc));\n"; lu << "CUDNN_SAFE_CALL(cudnnDestroyTensorDescriptor(d_output_desc));\n"; lu << "CUDNN_SAFE_CALL(cudnnDestroyPoolingDescriptor(desc));\n"; - + NNFUSION_LOG(INFO) << "---------6"; return _lu; } From 2a118cd8b38a95e30dc82591fe90b093d28ac9cf Mon Sep 17 00:00:00 2001 From: Yuqing Date: Fri, 10 Dec 2021 10:53:57 +0000 Subject: [PATCH 03/24] add parser --- parse_code.py | 103 ++++++++++++++++++ src/nnfusion/engine/pass/graph/dump_op.cpp | 56 +++++++++- .../pass/graph/pattern_substitution.cpp | 3 +- .../nnfusion/kernel_db/convert_external.py | 36 +++++- 4 files changed, 193 insertions(+), 5 deletions(-) create mode 100644 parse_code.py diff --git a/parse_code.py b/parse_code.py new file mode 100644 index 000000000..bd7564dea --- /dev/null +++ b/parse_code.py @@ -0,0 +1,103 @@ +# [{"tvm_func_name": "manual_dot_nn_op_float_m1_k256_n256_kernel0", +# "op_type": "Dot", +# "parameters": {"arg0_shape": [1, 256], +# "arg1_shape": [256, 256], +# "out_shape": [1, 256], +# "transpose_A": false, +# "transpose_B": false}, +# "code": "extern \"C\" __global__ void manual_dot_nn_op_float_m1_k256_n256_kernel0(float* input0, float* input1, float* output0)\n{\n int warp_id = threadIdx.x >> 5;\n int lane_id = threadIdx.x & 31;\n int col_id = blockIdx.x * blockDim.x / 4 + lane_id;\n if (col_id < 256)\n {\n float val = 0;\n int k_start = warp_id * 64;\n int k_end = (warp_id + 1) * 64;\n for (int i = k_start; i < k_end; i++)\n {\n val = fma(input0[i], input1[i * 256 + col_id], val);\n }\n if (warp_id == 0)\n {\n output0[col_id]=0;\n }\n __syncthreads();\n atomicAdd(output0 + col_id, val);\n }\n\n}\n", +# "gridDim": [8, 1, 1], +# "blockDim": [128, 1, 1]}] + +import json +import sys +import argparse +parser=argparse.ArgumentParser() +parser = argparse.ArgumentParser() +parser.add_argument('--op_type', type=str, default='') +parser.add_argument('--source_file', type=str, default='') 
+parser.add_argument('--json_file', type=str, default='example.json') +parser.add_argument("--input0_shape", nargs="*", type=int,default=[1, 2, 3]) +parser.add_argument("--input1_shape", nargs="*", type=int,default=[1, 2, 3]) +parser.add_argument("--output0_shape", nargs="*", type=int,default=[1, 2, 3]) +parser.add_argument("--transpose_A", type=bool, default=False) +parser.add_argument("--transpose_B", type=bool, default=False) +parser.add_argument("--stride", nargs="*", type=int,default=[1, 1]) +parser.add_argument("--padding", nargs="*", type=int,default=[0, 0]) +parser.add_argument("--dilation", nargs="*", type=int,default=[1, 1]) +parser.add_argument("--window_shape", nargs="*", type=int,default=[1, 1]) +parser.add_argument("--reduction_axis", nargs="*", type=int,default=[0]) + +args = parser.parse_args() + +info = {} +info["parameters"] = {} +op_type = args.op_type +info["op_type"] = op_type +source_file = args.source_file +json_file = args.json_file +tvm_func_name = source_file.split("/")[-1][:-3] +info["tvm_func_name"] = tvm_func_name +if op_type == "Dot" or op_type == "BatchMatMul": + info["parameters"]["arg0_shape"] = args.input0_shape + info["parameters"]["arg1_shape"] = args.input1_shape + info["parameters"]["out_shape"] = args.output0_shape + info["parameters"]["transpose_A"] = args.transpose_A + info["parameters"]["transpose_B"] = args.transpose_B +elif op_type == "Convolution" or op_type == "DepthwiseConv2dNative": + info["parameters"]["input_shape"] = args.input0_shape + info["parameters"]["filter_shape"] = args.input1_shape + info["parameters"]["output_shape"] = args.output0_shape + info["parameters"]["window_movement_strides"] = args.stride + info["parameters"]["padding_below_diff"] = args.padding + info["parameters"]["window_dilation_strides"] = args.dilation +elif op_type == "MaxPool" or op_type == "AvgPool": + info["parameters"]["input_shape"] = args.input0_shape + info["parameters"]["output_shape"] = args.output0_shape + 
info["parameters"]["window_shape"] = args.window_shape + info["parameters"]["window_stride"] = args.stride + info["parameters"]["padding_below"] = args.padding +elif op_type == "Sum": + info["parameters"]["input_shape"] = args.input0_shape + info["parameters"]["output_shape"] = args.output0_shape + info["parameters"]["reduction_axis"] = args.reduction_axis +else: + info["parameters"]["input_shape"] = args.input0_shape + info["parameters"]["output_shape"] = args.output0_shape + + +code = "" +gridDim = [] +blockDim = [] +with open(source_file, 'r', encoding='utf-8') as f: + flag = False + for line in f.readlines(): + if flag: + code += line + if flag and line.startswith("}"): + flag = False + if line.startswith("extern \"C\" "): + line = line.replace("default_function_kernel0", tvm_func_name) + code += line + flag = True + if "dim3 grid(" in line: + line = line.split("(")[1].split(")")[0].split(",") + for i in line: + gridDim.append(int(i)) + if "dim3 block(" in line: + line = line.split("(")[1].split(")")[0].split(",") + for i in line: + blockDim.append(int(i)) + +info["code"] = code +info["gridDim"] = gridDim +info["blockDim"] = blockDim + +# json_file = source_file + ".json" +with open(json_file, 'w', encoding='utf-8') as fw: + json.dump(info, fw) + + + + + diff --git a/src/nnfusion/engine/pass/graph/dump_op.cpp b/src/nnfusion/engine/pass/graph/dump_op.cpp index a7b3b66ca..76e4f44eb 100644 --- a/src/nnfusion/engine/pass/graph/dump_op.cpp +++ b/src/nnfusion/engine/pass/graph/dump_op.cpp @@ -15,7 +15,8 @@ bool DumpOp::run_on_graph(std::shared_ptr& graph) return true; std::ofstream out(FLAGS_fdump_op_file); - out << "op\tinput\toutput\tfused nodes\n"; + out << "op\tinput\toutput\tfused nodes\tstride\tpad\tdilation or window shape\treduce " + "axis\tdata format\n"; std::vector> nodes = graph->get_ordered_ops(); for (auto it : nodes) { @@ -58,6 +59,59 @@ bool DumpOp::run_on_graph(std::shared_ptr& graph) auto op = static_pointer_cast(it->get_op_ptr()); 
NNFUSION_CHECK_NOT_NULLPTR(op); out << op->get_activation() << "\t"; + out << op->get_window_movement_strides() << "\t"; + out << op->get_padding_below() << ", " << op->get_padding_above() << "\t"; + out << op->get_window_dilation_strides() << "\t"; + } + else if (it->get_op_type() == "AvgPool") + { + auto op = static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << "\t"; + out << op->get_window_movement_strides() << "\t"; + out << op->get_padding_below() << ", " << op->get_padding_above() << "\t"; + out << op->get_window_shape() << "\t"; + } + else if (it->get_op_type() == "AvgPool") + { + auto op = static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << "\t"; + out << op->get_window_movement_strides() << "\t"; + out << op->get_padding_below() << ", " << op->get_padding_above() << "\t"; + out << op->get_window_shape() << "\t"; + } + else if (it->get_op_type() == "Pad") + { + auto op = static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << "\t"; + out << "\t"; + out << op->get_padding_below() << ", " << op->get_padding_above() << ", " + << op->get_padding_interior() << "\t"; + out << "\t"; + } + else if (it->get_op_type() == "Sum") + { + auto op = static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << "\t"; + out << "\t"; + out << "\t"; + out << "\t"; + out << op->get_reduction_axes() << "\t"; + } + else if (it->get_op_type() == "DepthwiseConv2dNative") + { + auto op = static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << "\t"; + out << op->localOpConfig.getRoot()["strides"] << "\t"; + out << op->localOpConfig.getRoot()["padding_before"] << ", " + << op->localOpConfig.getRoot()["padding_after"] << "\t"; + out << op->localOpConfig.getRoot()["dilations"] << "\t"; + out << "\t"; + out << op->localOpConfig.getRoot()["data_format"] << "\t"; } out << "\n"; } diff --git a/src/nnfusion/engine/pass/graph/pattern_substitution.cpp 
b/src/nnfusion/engine/pass/graph/pattern_substitution.cpp index 815c30850..1f34d267e 100644 --- a/src/nnfusion/engine/pass/graph/pattern_substitution.cpp +++ b/src/nnfusion/engine/pass/graph/pattern_substitution.cpp @@ -32,7 +32,8 @@ const static std::vector> PATTERNS = { // {"Convolution", "BatchNormInference"}, // Conv-BN-Relu is converted into Conv-Add-Relu {"Convolution", "Add", "Relu"}, - {"Convolution", "Relu"}}; + {"Convolution", "Relu"}, + {"Convolution", "Add"}}; REGISTER_OP(Matched_Pattern) // .attr("out_shape") diff --git a/src/tools/nnfusion/kernel_db/convert_external.py b/src/tools/nnfusion/kernel_db/convert_external.py index 5ab7626fb..2d9b7a369 100644 --- a/src/tools/nnfusion/kernel_db/convert_external.py +++ b/src/tools/nnfusion/kernel_db/convert_external.py @@ -25,6 +25,10 @@ # Todo: re-org operator definition to oop and coordinate to NNFusion param_list = { + "DepthwiseConv2dNative": { + 'symbol': ['input0', 'input1', 'output0'], + 'dtype': ['float*', 'float*', 'float*'] + }, "Convolution": { 'symbol': ['input0', 'input1', 'output0'], 'dtype': ['float*', 'float*', 'float*'] @@ -37,10 +41,18 @@ 'symbol': ['input0', 'output0'], 'dtype': ['float*', 'float*'] }, + "Sum": { + 'symbol': ['input0', 'output0'], + 'dtype': ['float*', 'float*'] + }, "Dot": { 'symbol': ['input0', 'input1', 'output0'], 'dtype': ['float*', 'float*', 'float*'] }, + "BatchMatMul": { + 'symbol': ['input0', 'input1', 'output0'], + 'dtype': ['float*', 'float*', 'float*'] + }, "Fused_Convolution_Relu": { 'symbol': ['input0', 'input1', 'output0'], 'dtype': ['float*', 'float*', 'float*'] @@ -57,6 +69,10 @@ 'symbol': ['input0', 'input1', 'output0', 'input2'], 'dtype': ['float*', 'float*', 'float*', 'float*'] }, + "Fused_Convolution_Add": { + 'symbol': ['input0', 'input1', 'output0', 'input2'], + 'dtype': ['float*', 'float*', 'float*', 'float*'] + }, "AvgPool": { 'symbol': ['input0', 'output0'], 'dtype': ['float*', 'float*'] @@ -64,7 +80,7 @@ } conv_augmented = 
["Fused_Convolution_Batchnorm", - "Fused_Convolution_Batchnorm_Relu", "Fused_Convolution_Add_Relu"] + "Fused_Convolution_Batchnorm_Relu", "Fused_Convolution_Add_Relu", "Fused_Convolution_Add"] conv_family = ["Convolution", "Fused_Convolution_Relu"] + conv_augmented @@ -102,6 +118,13 @@ def gen_key(data, dtype="float"): for shape in out_shape * 2) + "float" * 2 * len(out_shape) else: raise ("to be specified") + elif op_type == "DepthwiseConv2dNative": + key += "".join(["Strides{", ", ".join(str(i) + for i in parameters["window_movement_strides"]), "}"]) + key += "".join(["Strides{", ", ".join(str(i) + for i in parameters["window_dilation_strides"]), "}"]) + key += "".join(["CoordinateDiff{", ", ".join(str(i) + for i in parameters["padding_below_diff"]), "}"]) elif op_type == "AvgPool" or op_type == "MaxPool": key += "Shape{" + ", ".join(str(i) for i in parameters["window_shape"]) + "}" @@ -109,6 +132,9 @@ def gen_key(data, dtype="float"): for i in parameters["window_stride"]) + "}" key += "Shape{" + ", ".join(str(i) for i in parameters["padding_below"]) + "}" + elif op_type == "Sum": + key += "AxisSet{" + ", ".join(str(i) + for i in parameters["reduction_axis"]) + "}" else: pass @@ -125,7 +151,7 @@ def gen_config(op_type, kernel, shared_memory, num_sync): "blockDim": kernel["blockDim"], "gridDim": kernel["gridDim"], } - if op_type in conv_family: + if op_type in conv_family or op_type == "DepthwiseConv2dNative": config["in_shape"] = [kernel["parameters"] ["input_shape"], kernel["parameters"]["filter_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] @@ -140,7 +166,7 @@ def gen_config(op_type, kernel, shared_memory, num_sync): "function_signature"] = "extern \"C\" __global__ void (float* input0, float* input1, float* input2, float* output0)" else: config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* input1, float* output0)" - elif (op_type == "Dot"): + elif (op_type == "Dot" or op_type == "BatchMatMul"): 
config["in_shape"] = [kernel["parameters"] ["arg0_shape"], kernel["parameters"]["arg1_shape"]] config["out_shape"] = [kernel["parameters"]["out_shape"]] @@ -150,6 +176,10 @@ def gen_config(op_type, kernel, shared_memory, num_sync): config["in_shape"] = [kernel["parameters"]["input_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* output0)" + elif (op_type == "Sum"): + config["in_shape"] = [kernel["parameters"]["input_shape"]] + config["out_shape"] = [kernel["parameters"]["output_shape"]] + config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* output0)" elif (op_type == "AvgPool" or op_type == "MaxPool"): config["in_shape"] = [kernel["parameters"]["input_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] From d6d34b381c4705a9f472efb27eadba432731078e Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sat, 11 Dec 2021 10:10:44 +0000 Subject: [PATCH 04/24] remove launch bounds --- parse_code.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/parse_code.py b/parse_code.py index bd7564dea..60e3c6571 100644 --- a/parse_code.py +++ b/parse_code.py @@ -10,16 +10,16 @@ # "blockDim": [128, 1, 1]}] import json -import sys +import re import argparse parser=argparse.ArgumentParser() parser = argparse.ArgumentParser() -parser.add_argument('--op_type', type=str, default='') -parser.add_argument('--source_file', type=str, default='') -parser.add_argument('--json_file', type=str, default='example.json') -parser.add_argument("--input0_shape", nargs="*", type=int,default=[1, 2, 3]) +parser.add_argument('--op_type', required=True, type=str, default='') +parser.add_argument('--source_file', required=True, type=str, default='') +parser.add_argument('--json_file', required=True, type=str, default='example.json') +parser.add_argument("--input0_shape", required=True, nargs="*", type=int,default=[1, 2, 3]) 
parser.add_argument("--input1_shape", nargs="*", type=int,default=[1, 2, 3]) -parser.add_argument("--output0_shape", nargs="*", type=int,default=[1, 2, 3]) +parser.add_argument("--output0_shape", required=True, nargs="*", type=int,default=[1, 2, 3]) parser.add_argument("--transpose_A", type=bool, default=False) parser.add_argument("--transpose_B", type=bool, default=False) parser.add_argument("--stride", nargs="*", type=int,default=[1, 1]) @@ -78,6 +78,10 @@ flag = False if line.startswith("extern \"C\" "): line = line.replace("default_function_kernel0", tvm_func_name) + match = re.search("__launch_bounds__\([0-9]*\) ", line) + if match: + lb = match.group() + line = line.replace(lb, "") code += line flag = True if "dim3 grid(" in line: From 6feaed5307b64ee287303298e5afefa8d04808af Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sat, 11 Dec 2021 10:13:17 +0000 Subject: [PATCH 05/24] replace parse_code.py location --- parse_code.py => src/tools/nnfusion/kernel_db/parse_code.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename parse_code.py => src/tools/nnfusion/kernel_db/parse_code.py (100%) diff --git a/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py similarity index 100% rename from parse_code.py rename to src/tools/nnfusion/kernel_db/parse_code.py From b636141b1412bf7e24483c4b0136f683e69fc5fc Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sun, 12 Dec 2021 07:00:12 +0000 Subject: [PATCH 06/24] ad_hoc change for batchmatmul and depthwiseconv2d --- .../DepthwiseConv2dNative.cpp | 7 +- src/nnfusion/engine/pass/graph/dump_op.cpp | 4 +- .../tensorflow_import/util/graph_convert.cpp | 69 +++++++++++++++++-- 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp index 7ebd95413..1fabb53b0 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp +++ 
b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp @@ -18,7 +18,12 @@ REGISTER_OP(DepthwiseConv2dNative) const Shape& input_shape = gnode->get_input_shape(0); // [ filter_rows, filter_cols, in_depth, depth_multiplier ] - const Shape& filter_shape = gnode->get_input_shape(1); + // const Shape& filter_shape = gnode->get_input_shape(1); + + // ad_hoc: [ in_depth, depth_multiplier, filter_rows, filter_cols ] + const Shape& filter_shape_ts = gnode->get_input_shape(1); + nnfusion::Shape filter_shape{ + filter_shape_ts[2], filter_shape_ts[3], filter_shape_ts[0], filter_shape_ts[1]}; std::string data_format = op->localOpConfig.getRoot()["data_format"]; bool is_nhwc = (data_format == "NHWC"); diff --git a/src/nnfusion/engine/pass/graph/dump_op.cpp b/src/nnfusion/engine/pass/graph/dump_op.cpp index 76e4f44eb..0f4e45e95 100644 --- a/src/nnfusion/engine/pass/graph/dump_op.cpp +++ b/src/nnfusion/engine/pass/graph/dump_op.cpp @@ -72,9 +72,9 @@ bool DumpOp::run_on_graph(std::shared_ptr& graph) out << op->get_padding_below() << ", " << op->get_padding_above() << "\t"; out << op->get_window_shape() << "\t"; } - else if (it->get_op_type() == "AvgPool") + else if (it->get_op_type() == "MaxPool") { - auto op = static_pointer_cast(it->get_op_ptr()); + auto op = static_pointer_cast(it->get_op_ptr()); NNFUSION_CHECK_NOT_NULLPTR(op); out << "\t"; out << op->get_window_movement_strides() << "\t"; diff --git a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp index 46394865c..f514da652 100644 --- a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp +++ b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp @@ -358,17 +358,69 @@ namespace nnfusion status = GetNodeAttr(node.attr(), "adj_y", adj_y); NNFUSION_CHECK(status); + std::shared_ptr lhs_gnode_ts = lhs_gnode; + std::shared_ptr rhs_gnode_ts = rhs_gnode; + auto lhs_shape = lhs_gnode->get_shape(); + auto 
rhs_shape = rhs_gnode->get_shape(); int input_dims = lhs_gnode->get_output_shape(0).size(); + if (adj_x) + { + if (lhs_gnode->get_op_ptr()->is_constant()) + { + NNFUSION_LOG(INFO) << "batchmatmul lhs_gnode is constant. " << node.name(); + } + nnfusion::Shape ng_shape(input_dims); + for (size_t i = 0; i < input_dims - 2; i++) + { + ng_shape[i] = lhs_shape[i]; + } + + ng_shape[input_dims - 2] = lhs_shape[input_dims - 1]; + ng_shape[input_dims - 1] = lhs_shape[input_dims - 2]; + + nnfusion::AxisVector ng_axis_order{0, 1, 3, 2}; + auto reshape_op = + std::make_shared(ng_axis_order, ng_shape); + reshape_op->set_name(lhs_gnode->get_name() + "_ts"); + auto reshape_gnode = m_graph->add_node_and_edge(reshape_op, {lhs_gnode}); + lhs_gnode_ts = reshape_gnode; + } + + if (adj_y) + { + if (rhs_gnode->get_op_ptr()->is_constant()) + { + NNFUSION_LOG(INFO) << "batchmatmul rhs_gnode is constant. " << node.name(); + } + nnfusion::Shape ng_shape(input_dims); + for (size_t i = 0; i < input_dims - 2; i++) + { + ng_shape[i] = rhs_shape[i]; + } + + ng_shape[input_dims - 2] = rhs_shape[input_dims - 1]; + ng_shape[input_dims - 1] = rhs_shape[input_dims - 2]; + + nnfusion::AxisVector ng_axis_order{0, 1, 3, 2}; + auto reshape_op = + std::make_shared(ng_axis_order, ng_shape); + reshape_op->set_name(rhs_gnode->get_name() + "_ts"); + auto reshape_gnode = m_graph->add_node_and_edge(reshape_op, {rhs_gnode}); + rhs_gnode_ts = reshape_gnode; + } nnfusion::op::OpConfig::any myConfig; - myConfig["adj_x"]["b"] = adj_x; - myConfig["adj_y"]["b"] = adj_y; + // myConfig["adj_x"]["b"] = adj_x; + // myConfig["adj_y"]["b"] = adj_y; + myConfig["adj_x"]["b"] = false; + myConfig["adj_y"]["b"] = false; auto generic_op = std::make_shared( node.name(), "BatchMatMul", // select which existing kernels to use; myConfig); - auto generic_gnode = m_graph->add_node_and_edge(generic_op, {lhs_gnode, rhs_gnode}); + auto generic_gnode = + m_graph->add_node_and_edge(generic_op, {lhs_gnode_ts, rhs_gnode_ts}); NamedNodeVector 
ret{{node.name(), generic_gnode}}; return ret; } @@ -943,8 +995,17 @@ namespace nnfusion auto generic_op = std::make_shared( node.name(), "DepthwiseConv2dNative", op_config); + nnfusion::Shape filter_shape_ts{ + filter_shape[2], filter_shape[3], filter_shape[0], filter_shape[1]}; + + nnfusion::AxisVector filter_axis_order{2, 3, 0, 1}; + auto reshape_op = + std::make_shared(filter_axis_order, filter_shape_ts); + reshape_op->set_name(filter_gnode->get_name() + "_ts"); + auto filter_gnode_ts = m_graph->add_node_and_edge(reshape_op, {filter_gnode}); + auto conv_gnode = - m_graph->add_node_and_edge(generic_op, {input_gnode, filter_gnode}); + m_graph->add_node_and_edge(generic_op, {input_gnode, filter_gnode_ts}); // auto reshape_conv_gnode = BatchToTensorflow(is_nhwc, conv_gnode); // if (reshape_conv_gnode != nullptr && default_device != GENERIC_CPU && From 10b5d0b0645fbec6874105ea1d045d322a1c0576 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sun, 12 Dec 2021 07:34:26 +0000 Subject: [PATCH 07/24] add dump broadcast axis, add broadcast to kernel_db --- src/nnfusion/engine/pass/graph/dump_op.cpp | 10 ++++++++++ src/tools/nnfusion/kernel_db/convert_external.py | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/nnfusion/engine/pass/graph/dump_op.cpp b/src/nnfusion/engine/pass/graph/dump_op.cpp index 0f4e45e95..bd0907b96 100644 --- a/src/nnfusion/engine/pass/graph/dump_op.cpp +++ b/src/nnfusion/engine/pass/graph/dump_op.cpp @@ -101,6 +101,16 @@ bool DumpOp::run_on_graph(std::shared_ptr& graph) out << "\t"; out << op->get_reduction_axes() << "\t"; } + else if (it->get_op_type() == "Broadcast") + { + auto op = static_pointer_cast(it->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + out << "\t"; + out << "\t"; + out << "\t"; + out << "\t"; + out << op->get_broadcast_axes() << "\t"; + } else if (it->get_op_type() == "DepthwiseConv2dNative") { auto op = static_pointer_cast(it->get_op_ptr()); diff --git a/src/tools/nnfusion/kernel_db/convert_external.py 
b/src/tools/nnfusion/kernel_db/convert_external.py index 2d9b7a369..9ecdd8aab 100644 --- a/src/tools/nnfusion/kernel_db/convert_external.py +++ b/src/tools/nnfusion/kernel_db/convert_external.py @@ -41,6 +41,10 @@ 'symbol': ['input0', 'output0'], 'dtype': ['float*', 'float*'] }, + "Broadcast": { + 'symbol': ['input0', 'output0'], + 'dtype': ['float*', 'float*'] + }, "Sum": { 'symbol': ['input0', 'output0'], 'dtype': ['float*', 'float*'] @@ -172,7 +176,7 @@ def gen_config(op_type, kernel, shared_memory, num_sync): config["out_shape"] = [kernel["parameters"]["out_shape"]] config[ "function_signature"] = "extern \"C\" __global__ void (float* __restrict__ input0, float* __restrict__ input1, float* __restrict__ output0)" - elif (op_type == "Relu"): + elif (op_type == "Relu" or op_type == "Broadcast"): config["in_shape"] = [kernel["parameters"]["input_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* output0)" From 552ab24630bc71f3b322b74ba85f0a6e707ef9c6 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sun, 12 Dec 2021 09:08:04 +0000 Subject: [PATCH 08/24] kernel name --- src/tools/nnfusion/kernel_db/parse_code.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tools/nnfusion/kernel_db/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py index 60e3c6571..8ca6e5607 100644 --- a/src/tools/nnfusion/kernel_db/parse_code.py +++ b/src/tools/nnfusion/kernel_db/parse_code.py @@ -12,7 +12,7 @@ import json import re import argparse -parser=argparse.ArgumentParser() + parser = argparse.ArgumentParser() parser.add_argument('--op_type', required=True, type=str, default='') parser.add_argument('--source_file', required=True, type=str, default='') @@ -77,11 +77,13 @@ if flag and line.startswith("}"): flag = False if line.startswith("extern \"C\" "): - line = line.replace("default_function_kernel0", tvm_func_name) match = 
re.search("__launch_bounds__\([0-9]*\) ", line) if match: lb = match.group() line = line.replace(lb, "") + kernel_name = re.search("void .*_kernel0", line).group() + # print(kernel_name) + line = line.replace(kernel_name, "void *" + tvm_func_name) code += line flag = True if "dim3 grid(" in line: From 8b6fb8f92f0a9af746aae3c248d9cd5de600fd25 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sun, 12 Dec 2021 09:10:32 +0000 Subject: [PATCH 09/24] fix bug --- src/tools/nnfusion/kernel_db/parse_code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/nnfusion/kernel_db/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py index 8ca6e5607..26b2274c7 100644 --- a/src/tools/nnfusion/kernel_db/parse_code.py +++ b/src/tools/nnfusion/kernel_db/parse_code.py @@ -83,7 +83,7 @@ line = line.replace(lb, "") kernel_name = re.search("void .*_kernel0", line).group() # print(kernel_name) - line = line.replace(kernel_name, "void *" + tvm_func_name) + line = line.replace(kernel_name, "void " + tvm_func_name) code += line flag = True if "dim3 grid(" in line: From d23b66cfe8913950ffe0bf5d61869f1fefb08f6f Mon Sep 17 00:00:00 2001 From: Yuqing Date: Sun, 12 Dec 2021 10:11:36 +0000 Subject: [PATCH 10/24] add white list --- src/nnfusion/engine/cache/manager.cpp | 6 +++++- src/nnfusion/engine/pass/graph/kernel_selection.cpp | 1 - src/tools/nnfusion/kernel_db/parse_code.py | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/nnfusion/engine/cache/manager.cpp b/src/nnfusion/engine/cache/manager.cpp index 319c3ca2e..133f52a2c 100644 --- a/src/nnfusion/engine/cache/manager.cpp +++ b/src/nnfusion/engine/cache/manager.cpp @@ -88,7 +88,11 @@ CREATE TABLE IF NOT EXISTS KernelCache( "MaxPool", "Fused_Convolution_Relu", "Fused_Convolution_Add_Relu", - "Matched_Pattern"}); + "Matched_Pattern", + "BatchMatMul", + "Broadcast", + "Sum", + "DepthwiseConv2dNative"}); } } diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp 
b/src/nnfusion/engine/pass/graph/kernel_selection.cpp index 076cee68f..54b21a096 100644 --- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp @@ -319,7 +319,6 @@ pair { // fetch all available kernel entries from kernel cache DB auto fetched = cache_manager->fetch_all(identifier, get_device_str(devtype)); - // emit External kernels { for (auto kernel_entry : fetched) diff --git a/src/tools/nnfusion/kernel_db/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py index 26b2274c7..e8941aa54 100644 --- a/src/tools/nnfusion/kernel_db/parse_code.py +++ b/src/tools/nnfusion/kernel_db/parse_code.py @@ -37,6 +37,7 @@ source_file = args.source_file json_file = args.json_file tvm_func_name = source_file.split("/")[-1][:-3] +tvm_func_name = tvm_func_name.replace("[", "_").replace("]", "_").replace(",", "_") info["tvm_func_name"] = tvm_func_name if op_type == "Dot" or op_type == "BatchMatMul": info["parameters"]["arg0_shape"] = args.input0_shape From 0719cf75bc9e1335170ab8513cc4559614197b76 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Mon, 13 Dec 2021 02:44:36 +0000 Subject: [PATCH 11/24] remove sum parameters --- src/tools/nnfusion/kernel_db/convert_external.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tools/nnfusion/kernel_db/convert_external.py b/src/tools/nnfusion/kernel_db/convert_external.py index 9ecdd8aab..299c44c43 100644 --- a/src/tools/nnfusion/kernel_db/convert_external.py +++ b/src/tools/nnfusion/kernel_db/convert_external.py @@ -136,9 +136,9 @@ def gen_key(data, dtype="float"): for i in parameters["window_stride"]) + "}" key += "Shape{" + ", ".join(str(i) for i in parameters["padding_below"]) + "}" - elif op_type == "Sum": - key += "AxisSet{" + ", ".join(str(i) - for i in parameters["reduction_axis"]) + "}" + # elif op_type == "Sum": + # key += "AxisSet{" + ", ".join(str(i) + # for i in parameters["reduction_axis"]) + "}" else: pass From 
a19d3e3311000caed98270f41ba8f945af5a2679 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Mon, 13 Dec 2021 03:30:46 +0000 Subject: [PATCH 12/24] add parameters --- .../nnfusion/kernel_db/convert_external.py | 21 +++++++++++++++---- src/tools/nnfusion/kernel_db/parse_code.py | 5 +++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/tools/nnfusion/kernel_db/convert_external.py b/src/tools/nnfusion/kernel_db/convert_external.py index 299c44c43..ebe3857a7 100644 --- a/src/tools/nnfusion/kernel_db/convert_external.py +++ b/src/tools/nnfusion/kernel_db/convert_external.py @@ -136,9 +136,12 @@ def gen_key(data, dtype="float"): for i in parameters["window_stride"]) + "}" key += "Shape{" + ", ".join(str(i) for i in parameters["padding_below"]) + "}" - # elif op_type == "Sum": - # key += "AxisSet{" + ", ".join(str(i) - # for i in parameters["reduction_axis"]) + "}" + elif op_type == "Sum": + key += "AxisSet{" + ", ".join(str(i) + for i in parameters["reduction_axis"]) + "}" + elif op_type == "Broadcast": + key += "AxisSet{" + ", ".join(str(i) + for i in parameters["broadcast_axis"]) + "}" else: pass @@ -176,7 +179,7 @@ def gen_config(op_type, kernel, shared_memory, num_sync): config["out_shape"] = [kernel["parameters"]["out_shape"]] config[ "function_signature"] = "extern \"C\" __global__ void (float* __restrict__ input0, float* __restrict__ input1, float* __restrict__ output0)" - elif (op_type == "Relu" or op_type == "Broadcast"): + elif (op_type == "Relu"): config["in_shape"] = [kernel["parameters"]["input_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* output0)" @@ -184,6 +187,16 @@ def gen_config(op_type, kernel, shared_memory, num_sync): config["in_shape"] = [kernel["parameters"]["input_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* output0)" + 
config["parameters"] = { + "reduction_axis" : kernel["parameters"]["reduction_axis"] + } + elif (op_type == "Broadcast"): + config["in_shape"] = [kernel["parameters"]["input_shape"]] + config["out_shape"] = [kernel["parameters"]["output_shape"]] + config["function_signature"] = "extern \"C\" __global__ void (float* input0, float* output0)" + config["parameters"] = { + "broadcast_axis" : kernel["parameters"]["broadcast_axis"] + } elif (op_type == "AvgPool" or op_type == "MaxPool"): config["in_shape"] = [kernel["parameters"]["input_shape"]] config["out_shape"] = [kernel["parameters"]["output_shape"]] diff --git a/src/tools/nnfusion/kernel_db/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py index e8941aa54..bf6ab75c0 100644 --- a/src/tools/nnfusion/kernel_db/parse_code.py +++ b/src/tools/nnfusion/kernel_db/parse_code.py @@ -27,6 +27,7 @@ parser.add_argument("--dilation", nargs="*", type=int,default=[1, 1]) parser.add_argument("--window_shape", nargs="*", type=int,default=[1, 1]) parser.add_argument("--reduction_axis", nargs="*", type=int,default=[0]) +parser.add_argument("--broadcast_axis", nargs="*", type=int,default=[0]) args = parser.parse_args() @@ -62,6 +63,10 @@ info["parameters"]["input_shape"] = args.input0_shape info["parameters"]["output_shape"] = args.output0_shape info["parameters"]["reduction_axis"] = args.reduction_axis +elif op_type == "Broadcast": + info["parameters"]["input_shape"] = args.input0_shape + info["parameters"]["output_shape"] = args.output0_shape + info["parameters"]["broadcast_axis"] = args.broadcast_axis else: info["parameters"]["input_shape"] = args.input0_shape info["parameters"]["output_shape"] = args.output0_shape From ca5a052886f7726cf51779250879b936c270fe91 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Mon, 13 Dec 2021 08:37:23 +0000 Subject: [PATCH 13/24] add create_sh.py --- src/nnfusion/core/kernels/kernel_emitter.cpp | 31 +++++++- .../nnfusion/kernel_db/convert_external.py | 12 +-- 
src/tools/nnfusion/kernel_db/create_sh.py | 75 +++++++++++++++++++ src/tools/nnfusion/kernel_db/parse_code.py | 2 +- 4 files changed, 111 insertions(+), 9 deletions(-) create mode 100644 src/tools/nnfusion/kernel_db/create_sh.py diff --git a/src/nnfusion/core/kernels/kernel_emitter.cpp b/src/nnfusion/core/kernels/kernel_emitter.cpp index 7c8b9545d..543fb8e06 100644 --- a/src/nnfusion/core/kernels/kernel_emitter.cpp +++ b/src/nnfusion/core/kernels/kernel_emitter.cpp @@ -3,6 +3,7 @@ #include "kernel_emitter.hpp" #include "nnfusion/engine/async_manager.hpp" +#include "nnfusion/core/operators/generic_op/generic_op.hpp" #include @@ -473,7 +474,7 @@ std::string nnfusion::kernels::KernelContext::generate_identifier() str << avgpool->get_window_shape(); str << avgpool->get_window_movement_strides(); str << avgpool->get_padding_below(); - str << avgpool->get_padding_above(); + // str << avgpool->get_padding_above(); identifier += str.str(); } else if (op_type == "MaxPool") @@ -484,7 +485,7 @@ std::string nnfusion::kernels::KernelContext::generate_identifier() str << maxpool->get_window_shape(); str << maxpool->get_window_movement_strides(); str << maxpool->get_padding_below(); - str << maxpool->get_padding_above(); + // str << maxpool->get_padding_above(); identifier += str.str(); } else if (op_type == "Dot") @@ -498,6 +499,32 @@ std::string nnfusion::kernels::KernelContext::generate_identifier() // ///\todo: need to encode dot reduction_axes_count? 
// identifier += str.str(); } + else if (op_type == "Sum") + { + auto op = std::static_pointer_cast(ctx->gnode->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + std::stringstream str; + str << op->get_reduction_axes(); + identifier += str.str(); + } + else if (op_type == "Broadcast") + { + auto op = std::static_pointer_cast(ctx->gnode->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + std::stringstream str; + str << op->get_broadcast_axes(); + identifier += str.str(); + } + else if (op_type == "DepthwiseConv2dNative") + { + auto op = std::static_pointer_cast(ctx->gnode->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(op); + std::stringstream str; + str << op->localOpConfig.getRoot()["strides"]; + str << op->localOpConfig.getRoot()["dilations"]; + str << op->localOpConfig.getRoot()["padding_before"]; + identifier += str.str(); + } return identifier; } \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/convert_external.py b/src/tools/nnfusion/kernel_db/convert_external.py index ebe3857a7..f6732782f 100644 --- a/src/tools/nnfusion/kernel_db/convert_external.py +++ b/src/tools/nnfusion/kernel_db/convert_external.py @@ -123,12 +123,12 @@ def gen_key(data, dtype="float"): else: raise ("to be specified") elif op_type == "DepthwiseConv2dNative": - key += "".join(["Strides{", ", ".join(str(i) - for i in parameters["window_movement_strides"]), "}"]) - key += "".join(["Strides{", ", ".join(str(i) - for i in parameters["window_dilation_strides"]), "}"]) - key += "".join(["CoordinateDiff{", ", ".join(str(i) - for i in parameters["padding_below_diff"]), "}"]) + key += "".join(["[", ", ".join(str(i) + for i in parameters["window_movement_strides"]), "]"]) + key += "".join(["[", ", ".join(str(i) + for i in parameters["window_dilation_strides"]), "]"]) + key += "".join(["[", ", ".join(str(i) + for i in parameters["padding_below_diff"]), "]"]) elif op_type == "AvgPool" or op_type == "MaxPool": key += "Shape{" + ", ".join(str(i) for i in parameters["window_shape"]) 
+ "}" diff --git a/src/tools/nnfusion/kernel_db/create_sh.py b/src/tools/nnfusion/kernel_db/create_sh.py new file mode 100644 index 000000000..49fcc43da --- /dev/null +++ b/src/tools/nnfusion/kernel_db/create_sh.py @@ -0,0 +1,75 @@ +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--src_file', required=True, type=str, default='') +parser.add_argument('--dst_file', required=True, type=str, default='') +args = parser.parse_args() + +src_file = args.src_file +dst_file = args.dst_file + +with open(dst_file, 'w') as fw: + with open(src_file, 'r') as f: + for line in f.readlines(): + cmd = "" + op_type, input, output, fused_nodes, strides, pad, dilation_window, axis, format = line.split('\t') + inputs = input.split('Shape') + if op_type == 'Convolution': + if len(inputs) > 3: + op_type = 'Fused_Convolution_Add' + if fused_nodes == 'relu': + op_type = 'Fused_Convolution_Add_Relu' + elif fused_nodes == 'relu': + op_type = 'Fused_Convolution_Relu' + cmd += 'python parse_code.py --op_type ' + op_type + ' ' + input0 = inputs[1][1:-1].replace(',', '') + input1 = inputs[2][1:-1].replace(',', '') + cmd += '--input0_shape ' + input0 + ' ' + cmd += '--input1_shape ' + input1 + ' ' + cmd += '--output0_shape ' + output[6:-1].replace(',', '') + ' ' + + cmd += '--stride ' + strides[8:-1].replace(',','') + ' ' + cmd += '--padding ' + pad[15: 19].replace(',','') + ' ' + cmd += '--dilation ' + dilation_window[8: -1].replace(',', '') + ' ' + cmd += '\n' + elif op_type == 'DepthwiseConv2dNative': + cmd += 'python parse_code.py --op_type ' + op_type + ' ' + input0 = inputs[1][1:-1].replace(',', '') + input1 = inputs[2][1:-1].replace(',', '') + cmd += '--input0_shape ' + input0 + ' ' + cmd += '--input1_shape ' + input1 + ' ' + cmd += '--output0_shape ' + output[6:-1].replace(',', '') + ' ' + + cmd += '--stride ' + strides[1:-1].replace(',',' ') + ' ' + cmd += '--padding ' + pad[1: 4].replace(',',' ') + ' ' + cmd += '--dilation ' + dilation_window[1: 
-1].replace(',', ' ') + ' ' + cmd += '\n' + elif op_type == 'MaxPool' or op_type == 'AvgPool': + cmd += 'python parse_code.py --op_type ' + op_type + ' ' + input0 = inputs[1][1:-1].replace(',', '') + cmd += '--input0_shape ' + input0 + ' ' + cmd += '--output0_shape ' + output[6:-1].replace(',', '') + ' ' + cmd += '--window_shape ' + dilation_window[6:-1].replace(',', '') + ' ' + cmd += '--stride ' + strides[8:-1].replace(',','') + ' ' + cmd += '--padding ' + pad[6: 10].replace(',','') + ' ' + cmd += '\n' + elif op_type == "Sum": + cmd += 'python parse_code.py --op_type ' + op_type + ' ' + input0 = inputs[1][1:-1].replace(',', '') + cmd += '--input0_shape ' + input0 + ' ' + cmd += '--output0_shape ' + output[6:-1].replace(',', '') + ' ' + cmd += '--reduction_axis ' + axis[8:-1].replace(',', '') + ' ' + cmd += '\n' + elif op_type == "Dot" or op_type == "BatchMatMul": + cmd += 'python parse_code.py --op_type ' + op_type + ' ' + input0 = inputs[1][1:-1].replace(',', '') + input1 = inputs[2][1:-1].replace(',', '') + cmd += '--input0_shape ' + input0 + ' ' + cmd += '--input1_shape ' + input1 + ' ' + cmd += '--output0_shape ' + output[6:-1].replace(',', '') + ' ' + cmd += '\n' + fw.write(cmd) + + + + diff --git a/src/tools/nnfusion/kernel_db/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py index bf6ab75c0..9ff655c32 100644 --- a/src/tools/nnfusion/kernel_db/parse_code.py +++ b/src/tools/nnfusion/kernel_db/parse_code.py @@ -46,7 +46,7 @@ info["parameters"]["out_shape"] = args.output0_shape info["parameters"]["transpose_A"] = args.transpose_A info["parameters"]["transpose_B"] = args.transpose_B -elif op_type == "Convolution" or op_type == "DepthwiseConv2dNative": +elif op_type == "Convolution" or op_type == "DepthwiseConv2dNative" or op_type == "Fused_Convolution_Add" or op_type == "Fused_Convolution_Relu" or op_type == "Fused_Convolution_Add_Relu": info["parameters"]["input_shape"] = args.input0_shape info["parameters"]["filter_shape"] = args.input1_shape 
info["parameters"]["output_shape"] = args.output0_shape From 43c0f88803b9197d4206d8e33a45c4c28e371cf1 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Mon, 13 Dec 2021 12:41:51 +0000 Subject: [PATCH 14/24] fix bug --- src/tools/nnfusion/kernel_db/convert_external.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tools/nnfusion/kernel_db/convert_external.py b/src/tools/nnfusion/kernel_db/convert_external.py index f6732782f..f59973c41 100644 --- a/src/tools/nnfusion/kernel_db/convert_external.py +++ b/src/tools/nnfusion/kernel_db/convert_external.py @@ -123,11 +123,11 @@ def gen_key(data, dtype="float"): else: raise ("to be specified") elif op_type == "DepthwiseConv2dNative": - key += "".join(["[", ", ".join(str(i) + key += "".join(["[", ",".join(str(i) for i in parameters["window_movement_strides"]), "]"]) - key += "".join(["[", ", ".join(str(i) + key += "".join(["[", ",".join(str(i) for i in parameters["window_dilation_strides"]), "]"]) - key += "".join(["[", ", ".join(str(i) + key += "".join(["[", ",".join(str(i) for i in parameters["padding_below_diff"]), "]"]) elif op_type == "AvgPool" or op_type == "MaxPool": key += "Shape{" + ", ".join(str(i) From a9c3063cdc9a47010acc5d03d5e31724384db28f Mon Sep 17 00:00:00 2001 From: Yuqing Date: Mon, 13 Dec 2021 12:59:51 +0000 Subject: [PATCH 15/24] adhoc change for depthwise --- .../core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp index 772f9511b..fdd74353f 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp @@ -14,7 +14,10 @@ cuda::DepthwiseConv2dNative::DepthwiseConv2dNative(shared_ptr ctx const Shape input_shape = Shape(ctx->inputs[0]->get_shape()); // [ filter_rows, filter_cols, in_depth, 
depth_multiplier] - const Shape filter_shape = Shape(ctx->inputs[1]->get_shape()); + // const Shape filter_shape = Shape(ctx->inputs[1]->get_shape()); + // ad_hoc + const Shape filter_shape_ori = Shape(ctx->inputs[1]->get_shape()); + Shape filter_shape = Shape{filter_shape_ori[2], filter_shape_ori[3], filter_shape_ori[0], filter_shape_ori[1]}; const Shape output_shape = Shape(ctx->outputs[0]->get_shape()); data_format = op->localOpConfig.getRoot()["data_format"]; From 9a022c3ad5f20e214ff8e1f2a81d8aac97ebb491 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Mon, 13 Dec 2021 14:22:07 +0000 Subject: [PATCH 16/24] fix bug --- .../kernels/cuda_gpu/kernels/elementwise.hpp | 26 +++++++++--- .../cuda_gpu/kernels/elementwise_fused.cpp | 40 ++++++++++++++++--- src/nnfusion/engine/cache/manager.cpp | 5 ++- src/tools/nnfusion/kernel_db/profile.py | 2 +- 4 files changed, 59 insertions(+), 14 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.hpp index 4a0efb6b2..ce38a67b8 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.hpp @@ -70,12 +70,19 @@ namespace nnfusion { std::string tid = - "blockIdx.x * " + std::to_string(blocks) + " + threadIdx.x"; + "blockIdx.x * " + std::to_string(blocks) + " * 2 + threadIdx.x"; + std::string tid1 = + "blockIdx.x * " + std::to_string(blocks) + " * 2 + threadIdx.x + 128"; if (grids == 1) + { tid = "threadIdx.x"; + tid1 = "threadIdx.x + 128"; + } if (bound) + { lu << "if (" << tid << " >= " << bound << ") return;"; - + lu << "if (" << tid1 << " >= " << bound << ") return;"; + } { std::string invoke_func = op; if (m_context->gnode->get_op_type() == "Convert") @@ -89,6 +96,13 @@ namespace nnfusion lu << "input" << i << "[" << tid << "], "; } lu << "input" << num_inputs - 1 << "[" << tid << "]);\n"; + + lu << "output0[" << tid1 << "] = " << invoke_func << "("; + for (size_t i = 0; i < 
num_inputs - 1; i++) + { + lu << "input" << i << "[" << tid1 << "], "; + } + lu << "input" << num_inputs - 1 << "[" << tid1 << "]);\n"; } } return lu_; @@ -134,26 +148,28 @@ namespace nnfusion { uint32_t num_ele = static_cast( nnfusion::shape_size(m_context->outputs[0]->get_shape())); - for (int i = 512; i >= 64; i >>= 1) + for (int i = 128; i >= 64; i >>= 1) { if (num_ele % i == 0) { grids = num_ele / i, blocks = i, bound = 0; + grids = grids / 2; return; } } - for (int i = 512; i >= 32; i--) + for (int i = 128; i >= 32; i--) { if (num_ele % i == 0) { grids = num_ele / i, blocks = i, bound = 0; + grids = grids / 2; return; } } if (num_ele < 32) grids = 1, blocks = num_ele, bound = 0; else - grids = (num_ele + 255) / 256, blocks = 256, bound = 1; + grids = (num_ele + 255) / 256, blocks = 128, bound = 1; } // shared_ptr kernel_ctx; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp index 4212307fa..47c379e62 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp @@ -72,14 +72,17 @@ LanguageUnit_p ElementWiseFused::emit_function_body() if (grids == 1) { lu << "int tid = threadIdx.x;\n"; + lu << "int tid1 = threadIdx.x + 128;\n"; } else { - lu << "int tid = blockIdx.x * " << std::to_string(blocks) << " + threadIdx.x;\n"; + lu << "int tid = blockIdx.x * " << std::to_string(blocks) << " * 2 + threadIdx.x;\n"; + lu << "int tid1 = blockIdx.x * " << std::to_string(blocks) << "* 2 + threadIdx.x + 128;\n"; } if (bound) { lu << "if (tid >= " << bound << ") return;\n"; + lu << "if (tid1 >= " << bound << ") return;\n"; } for (auto op_ctx : m_gnode->get_op_contexts()) @@ -87,15 +90,17 @@ LanguageUnit_p ElementWiseFused::emit_function_body() auto& out_tw = op_ctx->outputs[0]; if (auto bc = std::dynamic_pointer_cast(op_ctx->op)) { - std::string index = ""; + std::string index = "", index1 = ""; 
if (bc->is_inner_broadcast()) { index += "[tid / " + std::to_string(bc->get_inner_broadcast_size()) + "]"; + index1 += "[tid1 / " + std::to_string(bc->get_inner_broadcast_size()) + "]"; } else { NNFUSION_CHECK(bc->is_outer_broadcast()); index += "[tid % " + std::to_string(bc->get_outer_broadcast_size()) + "]"; + index1 += "[tid1 % " + std::to_string(bc->get_outer_broadcast_size()) + "]"; } local_tensors[out_tw->get_name()] = "temp" + std::to_string(temp_tensor_id++); auto& in_tw = op_ctx->inputs[0]; @@ -104,6 +109,9 @@ LanguageUnit_p ElementWiseFused::emit_function_body() lu << out_tw->get_element_type().c_type_string() << " " << local_tensors[out_tw->get_name()] << " = " << in_args[in_tw->get_name()] << index << ";\n"; + lu << out_tw->get_element_type().c_type_string() << " " + << local_tensors[out_tw->get_name()] << "_1 = " << in_args[in_tw->get_name()] << index1 + << ";\n"; } else if (auto rs = std::dynamic_pointer_cast(op_ctx->op)) { @@ -150,23 +158,29 @@ LanguageUnit_p ElementWiseFused::emit_function_body() invoke_func = op_kernel.first; } local_tensors[out_tw->get_name()] = "temp" + std::to_string(temp_tensor_id++); - std::vector input_args; + std::vector input_args, input_args1; for (int i = 0; i < op_ctx->inputs.size(); i++) { auto& in_tw = op_ctx->inputs[i]; if (in_args.count(in_tw->get_name()) > 0) { input_args.push_back(in_args[in_tw->get_name()] + "[tid]"); + input_args1.push_back(in_args[in_tw->get_name()] + "[tid1]"); } else { NNFUSION_CHECK(local_tensors.count(in_tw->get_name()) > 0); input_args.push_back(local_tensors[in_tw->get_name()]); + input_args1.push_back(local_tensors[in_tw->get_name()] + "_1"); } } lu << out_tw->get_element_type().c_type_string() << " " << local_tensors[out_tw->get_name()] << " = " << invoke_func << "(" << join(input_args, ", ") << ");\n"; + + lu << out_tw->get_element_type().c_type_string() << " " + << local_tensors[out_tw->get_name()] << "_1 = " << invoke_func << "(" + << join(input_args1, ", ") << ");\n"; } } @@ -183,6 
+197,18 @@ LanguageUnit_p ElementWiseFused::emit_function_body() << lu.get_code() << " " << pair.first; lu << in_args[pair.first] << "[tid];\n"; } + + lu << pair.second << "[tid1] = "; + if (local_tensors.count(pair.first) > 0) + { + lu << local_tensors[pair.first] << "_1;\n"; + } + else + { + NNFUSION_CHECK(in_args.count(pair.first) > 0) << m_context->gnode->get_name() << " " + << lu.get_code() << " " << pair.first << "_1"; + lu << in_args[pair.first] << "_1[tid1];\n"; + } } return lu_; @@ -261,26 +287,28 @@ void ElementWiseFused::compute_best_config(int& grids, int& blocks, int& bound) { uint32_t num_ele = static_cast(nnfusion::shape_size(m_context->outputs[0]->get_shape())); - for (int i = 512; i >= 64; i >>= 1) + for (int i = 128; i >= 64; i >>= 1) { if (num_ele % i == 0) { grids = num_ele / i, blocks = i, bound = 0; + grids = grids / 2; return; } } - for (int i = 512; i >= 32; i--) + for (int i = 128; i >= 32; i--) { if (num_ele % i == 0) { grids = num_ele / i, blocks = i, bound = 0; + grids = grids / 2; return; } } if (num_ele < 32) grids = 1, blocks = num_ele, bound = 0; else - grids = (num_ele + 255) / 256, blocks = 256, bound = 1; + grids = (num_ele + 255) / 256, blocks = 128, bound = 1; } REGISTER_KERNEL_EMITTER( diff --git a/src/nnfusion/engine/cache/manager.cpp b/src/nnfusion/engine/cache/manager.cpp index 133f52a2c..ea8f16450 100644 --- a/src/nnfusion/engine/cache/manager.cpp +++ b/src/nnfusion/engine/cache/manager.cpp @@ -88,6 +88,7 @@ CREATE TABLE IF NOT EXISTS KernelCache( "MaxPool", "Fused_Convolution_Relu", "Fused_Convolution_Add_Relu", + "Fused_Convolution_Add", "Matched_Pattern", "BatchMatMul", "Broadcast", @@ -105,7 +106,7 @@ KernelCacheManager::~KernelCacheManager() std::vector KernelCacheManager::fetch_all(std::string identifier, std::string device_type) { - NNFUSION_LOG(DEBUG) << "Trying to fetch kernel " << identifier + NNFUSION_LOG(INFO) << "Trying to fetch kernel " << identifier << " on DeviceType: " << device_type; sqlite3_stmt* pStmt; 
const char* fetch = R"( @@ -134,7 +135,7 @@ SELECT Key, Identifier, OpType, Attributes, Source, DeviceType, Function, Tags, if (SupportOpList.find(fetched_kernel->op_type) == SupportOpList.end()) { - NNFUSION_LOG(DEBUG) << "Unsupported op_type: " << fetched_kernel->op_type + NNFUSION_LOG(INFO) << "Unsupported op_type: " << fetched_kernel->op_type << ", ingore this fetch"; fetched.clear(); break; diff --git a/src/tools/nnfusion/kernel_db/profile.py b/src/tools/nnfusion/kernel_db/profile.py index d0e55d40d..b8b149127 100644 --- a/src/tools/nnfusion/kernel_db/profile.py +++ b/src/tools/nnfusion/kernel_db/profile.py @@ -134,7 +134,7 @@ def prepare_file(signature, code, config, path, parse=False): for shape in config["in_shape"]+config["out_shape"]: bytes_count.append(prod(shape)*4 + bytes_count[-1]) profile_kernel = profile_kernel.replace( - "__maxbytes__", str(bytes_count[-1])) + "__maxbytes__", str(bytes_count[-1] * 2)) init_input = "" input_parameters = "" From c084ca5771f7efc7979d9f95991d441507f051e9 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 14 Dec 2021 02:32:26 +0000 Subject: [PATCH 17/24] fix bug --- src/nnfusion/engine/pass/graph/pattern_substitution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnfusion/engine/pass/graph/pattern_substitution.cpp b/src/nnfusion/engine/pass/graph/pattern_substitution.cpp index 1f34d267e..d706d7461 100644 --- a/src/nnfusion/engine/pass/graph/pattern_substitution.cpp +++ b/src/nnfusion/engine/pass/graph/pattern_substitution.cpp @@ -151,7 +151,7 @@ class PatternOptimizer { // Todo: more tags, more platform std::set tags = {}; - auto fetched_kernel = kernel_db->fetch_with_tags(identifier, "CUDA", tags); + auto fetched_kernel = kernel_db->fetch_with_tags(identifier, "CUDA_GPU", tags); if (fetched_kernel != nullptr) { NNFUSION_CHECK(fetched_kernel->function != ""); From 9aa88e468a47b99f94366be70c2645874778bb12 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 14 Dec 2021 05:15:08 +0000 Subject: [PATCH 
18/24] fix bug --- src/tools/nnfusion/kernel_db/profile.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tools/nnfusion/kernel_db/profile.py b/src/tools/nnfusion/kernel_db/profile.py index b8b149127..eeb11f0c4 100644 --- a/src/tools/nnfusion/kernel_db/profile.py +++ b/src/tools/nnfusion/kernel_db/profile.py @@ -131,10 +131,13 @@ def prepare_file(signature, code, config, path, parse=False): f.write(profile_makefile) bytes_count = [0] + mul = 1 + if len(config["in_shape"]) + len(config["out_shape"]) == 2: + mul = 2 for shape in config["in_shape"]+config["out_shape"]: bytes_count.append(prod(shape)*4 + bytes_count[-1]) profile_kernel = profile_kernel.replace( - "__maxbytes__", str(bytes_count[-1] * 2)) + "__maxbytes__", str(bytes_count[-1] * mul)) init_input = "" input_parameters = "" From 9c9a2c43c9b4bb70f70aa367baf90d9ac6408002 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 14 Dec 2021 07:53:49 +0000 Subject: [PATCH 19/24] disable broadcast fusion --- .../engine/pass/graph/runtime_const_folding_pass.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp b/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp index 217a2f26d..0e98860f1 100644 --- a/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp +++ b/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp @@ -245,6 +245,13 @@ bool RuntimeConstantFoldingPass::run_on_graph(std::shared_ptr& graph) std::set> blocklist_nodes = {}; for (auto& node : graph->get_outputs()) blocklist_nodes.insert(node); + for (auto& node : graph->get_ordered_ops()) + { + if (node->get_op_type() == "Broadcast") + { + blocklist_nodes.insert(node); + } + } int folding_cnt; do From f476c4e339f31d3ddc2c6518c705b49d6a9f6347 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 14 Dec 2021 08:15:22 +0000 Subject: [PATCH 20/24] revert --- src/nnfusion/engine/cache/manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/nnfusion/engine/cache/manager.cpp b/src/nnfusion/engine/cache/manager.cpp index ea8f16450..6467f1f31 100644 --- a/src/nnfusion/engine/cache/manager.cpp +++ b/src/nnfusion/engine/cache/manager.cpp @@ -106,7 +106,7 @@ KernelCacheManager::~KernelCacheManager() std::vector KernelCacheManager::fetch_all(std::string identifier, std::string device_type) { - NNFUSION_LOG(INFO) << "Trying to fetch kernel " << identifier + NNFUSION_LOG(DEBUG) << "Trying to fetch kernel " << identifier << " on DeviceType: " << device_type; sqlite3_stmt* pStmt; const char* fetch = R"( @@ -136,7 +136,7 @@ SELECT Key, Identifier, OpType, Attributes, Source, DeviceType, Function, Tags, if (SupportOpList.find(fetched_kernel->op_type) == SupportOpList.end()) { NNFUSION_LOG(INFO) << "Unsupported op_type: " << fetched_kernel->op_type - << ", ingore this fetch"; + << ", ingore this fetch"; fetched.clear(); break; } From 896f1305d05e58a98814e5a32e6be5c37b47939e Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 14 Dec 2021 08:43:21 +0000 Subject: [PATCH 21/24] style --- .../core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp | 5 +++-- .../core/kernels/cuda_gpu/kernels/elementwise_fused.cpp | 7 ++++--- src/nnfusion/core/kernels/kernel_emitter.cpp | 2 +- src/nnfusion/engine/pass/graph/pattern_substitution.cpp | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp index fdd74353f..ee8c08faf 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp @@ -15,9 +15,10 @@ cuda::DepthwiseConv2dNative::DepthwiseConv2dNative(shared_ptr ctx const Shape input_shape = Shape(ctx->inputs[0]->get_shape()); // [ filter_rows, filter_cols, in_depth, depth_multiplier] // const Shape filter_shape = Shape(ctx->inputs[1]->get_shape()); - // ad_hoc + // ad_hoc const Shape 
filter_shape_ori = Shape(ctx->inputs[1]->get_shape()); - Shape filter_shape = Shape{filter_shape_ori[2], filter_shape_ori[3], filter_shape_ori[0], filter_shape_ori[1]}; + Shape filter_shape = + Shape{filter_shape_ori[2], filter_shape_ori[3], filter_shape_ori[0], filter_shape_ori[1]}; const Shape output_shape = Shape(ctx->outputs[0]->get_shape()); data_format = op->localOpConfig.getRoot()["data_format"]; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp index 47c379e62..e3379cb4e 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp @@ -110,8 +110,8 @@ LanguageUnit_p ElementWiseFused::emit_function_body() << local_tensors[out_tw->get_name()] << " = " << in_args[in_tw->get_name()] << index << ";\n"; lu << out_tw->get_element_type().c_type_string() << " " - << local_tensors[out_tw->get_name()] << "_1 = " << in_args[in_tw->get_name()] << index1 - << ";\n"; + << local_tensors[out_tw->get_name()] << "_1 = " << in_args[in_tw->get_name()] + << index1 << ";\n"; } else if (auto rs = std::dynamic_pointer_cast(op_ctx->op)) { @@ -206,7 +206,8 @@ LanguageUnit_p ElementWiseFused::emit_function_body() else { NNFUSION_CHECK(in_args.count(pair.first) > 0) << m_context->gnode->get_name() << " " - << lu.get_code() << " " << pair.first << "_1"; + << lu.get_code() << " " << pair.first + << "_1"; lu << in_args[pair.first] << "_1[tid1];\n"; } } diff --git a/src/nnfusion/core/kernels/kernel_emitter.cpp b/src/nnfusion/core/kernels/kernel_emitter.cpp index 543fb8e06..eac4896bb 100644 --- a/src/nnfusion/core/kernels/kernel_emitter.cpp +++ b/src/nnfusion/core/kernels/kernel_emitter.cpp @@ -2,8 +2,8 @@ // Licensed under the MIT License. 
#include "kernel_emitter.hpp" -#include "nnfusion/engine/async_manager.hpp" #include "nnfusion/core/operators/generic_op/generic_op.hpp" +#include "nnfusion/engine/async_manager.hpp" #include diff --git a/src/nnfusion/engine/pass/graph/pattern_substitution.cpp b/src/nnfusion/engine/pass/graph/pattern_substitution.cpp index d706d7461..6528727a3 100644 --- a/src/nnfusion/engine/pass/graph/pattern_substitution.cpp +++ b/src/nnfusion/engine/pass/graph/pattern_substitution.cpp @@ -155,7 +155,7 @@ class PatternOptimizer if (fetched_kernel != nullptr) { NNFUSION_CHECK(fetched_kernel->function != ""); - NNFUSION_LOG(INFO) << "Substitution applied: " << identifier; + NNFUSION_LOG(DEBUG) << "Substitution applied: " << identifier; return Substitution(matched, identifier); } } From 01485bce907bcdea44a6fe5cac75dcf74c995632 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Tue, 14 Dec 2021 10:40:55 +0000 Subject: [PATCH 22/24] adhoc for broadcast --- .../pass/graph/batchnorm_inference_folding_pass.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp index 86340b7b2..6f691b32d 100644 --- a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp +++ b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp @@ -3,6 +3,7 @@ #include "batchnorm_inference_folding_pass.hpp" #include +#include "nnfusion/core/kernels/cuda_gpu/kernels/anyop.hpp" #include "nnfusion/core/operators/op_define/add.hpp" #include "nnfusion/core/operators/op_define/batch_norm.hpp" #include "nnfusion/core/operators/op_define/broadcast.hpp" @@ -621,6 +622,11 @@ class BatchNormInferenceOptimizer auto new_broadcast_gnode = m_graph->add_node_and_edge( std::make_shared(conv_output_shape, broadcast_axes), {new_conv_bias_gnode}); + shared_ptr ke_ctx(new KernelContext(new_broadcast_gnode)); + KernelEmitter::Pointer any_op_ke = std::make_shared(ke_ctx); + 
any_op_ke->get_or_emit_source(); + (*new_broadcast_gnode)["Kernel_Selection_Result"] = std::make_pair(NNFusion_DeviceType::CUDA_GPU, any_op_ke); + m_nodes.resize(m_graph->get_max_node_id()); m_nodes[new_broadcast_gnode->get_id()] = std::make_shared(); m_nodes[new_broadcast_gnode->get_id()]->node = new_broadcast_gnode; @@ -790,6 +796,10 @@ class BatchNormInferenceOptimizer auto new_broadcast_gnode = m_graph->add_node_and_edge( std::make_shared(conv_output_shape, broadcast_axes), {bn_node->get_in_edge(1)->get_src()}); + shared_ptr ke_ctx(new KernelContext(new_broadcast_gnode)); + KernelEmitter::Pointer any_op_ke = std::make_shared(ke_ctx); + any_op_ke->get_or_emit_source(); + (*new_broadcast_gnode)["Kernel_Selection_Result"] = std::make_pair(NNFusion_DeviceType::CUDA_GPU, any_op_ke);; m_nodes.resize(m_graph->get_max_node_id()); m_nodes[new_broadcast_gnode->get_id()] = std::make_shared(); m_nodes[new_broadcast_gnode->get_id()]->node = new_broadcast_gnode; From 6d3ab6b3e3ab21a528d715bbf4236086061eed93 Mon Sep 17 00:00:00 2001 From: Yuqing Date: Thu, 16 Dec 2021 04:09:44 +0000 Subject: [PATCH 23/24] add kernel json --- src/tools/nnfusion/kernel_db/example.json | 1 + ...68,42,42]_[168,168,1,1]_[128,168,42,42].cu | 265 +++++++++++++++ ...36,21,21]_[336,336,1,1]_[128,336,21,21].cu | 176 ++++++++++ ...72,11,11]_[672,672,1,1]_[128,672,11,11].cu | 159 +++++++++ ...28,84,42,42]_[84,84,1,1]_[128,84,42,42].cu | 157 +++++++++ src/tools/nnfusion/kernel_db/parse_bert.sh | 12 + src/tools/nnfusion/kernel_db/parse_lstm.sh | 1 + src/tools/nnfusion/kernel_db/parse_nas.sh | 64 ++++ src/tools/nnfusion/kernel_db/parse_res.sh | 26 ++ src/tools/nnfusion/kernel_db/parser.out | 317 ++++++++++++++++++ src/tools/nnfusion/kernel_db/parsetab.py | 41 +++ .../nnfusion/kernel_db/roller_bert/convert.sh | 11 + .../kernel_db/roller_bert/convert_ansor.sh | 7 + ...,512]_[128,16,512,64]_[128,16,512,64].json | 1 + ...,64]_[128,16,512,64]_[128,16,512,512].json | 1 + 
...adcast_[128,512,512]_[128,16,512,512].json | 1 + ...ler_Broadcast_[128,512]_[128,512,512].json | 1 + ...[65536,1024]_[1024,1024]_[65536,1024].json | 1 + ...[65536,1024]_[1024,4096]_[65536,1024].json | 1 + ...r_Dot_[65536,2]_[2,1024]_[65536,1024].json | 1 + ...5536,30522]_[30522,1024]_[65536,1024].json | 1 + ...[65536,4096]_[4096,1024]_[65536,1024].json | 1 + .../roller_Sum_[128,512,1024]_[128,512].json | 1 + .../roller_Sum_[65536,1024]_[65536].json | 1 + ...ler_Dot_[128,256]_[256,256]_[256,256].json | 1 + .../nnfusion/kernel_db/roller_nas/convert.sh | 58 ++++ ...ool_[128,1008,42,42]_[128,1008,21,21].json | 1 + ...gPool_[128,168,42,42]_[128,168,42,42].json | 1 + ...gPool_[128,168,83,83]_[128,168,42,42].json | 1 + ...ool_[128,2016,21,21]_[128,2016,11,11].json | 1 + ...gPool_[128,336,21,21]_[128,336,21,21].json | 1 + ...gPool_[128,336,42,42]_[128,336,21,21].json | 1 + ...gPool_[128,42,165,165]_[128,42,83,83].json | 1 + ...AvgPool_[128,42,83,83]_[128,42,83,83].json | 1 + ...gPool_[128,672,11,11]_[128,672,11,11].json | 1 + ...gPool_[128,672,21,21]_[128,672,11,11].json | 1 + ...AvgPool_[128,84,42,42]_[128,84,42,42].json | 1 + ...AvgPool_[128,84,83,83]_[128,84,42,42].json | 1 + ...gPool_[128,96,165,165]_[128,96,83,83].json | 1 + ...21,21]_[168,1008,1,1]_[128,168,21,21].json | 1 + ...42,42]_[168,1008,1,1]_[128,168,42,42].json | 1 + ...42,42]_[336,1008,1,1]_[128,336,42,42].json | 1 + ...21,21]_[336,1344,1,1]_[128,336,21,21].json | 1 + ...2]_[168,168,1,1]_[128,168,42,42]_bias.json | 1 + ...2]_[168,168,1,1]_[128,168,42,42]_relu.json | 1 + ...68,42,42]_[84,168,1,1]_[128,84,42,42].json | 1 + ...68,83,83]_[84,168,1,1]_[128,84,83,83].json | 1 + ...11,11]_[336,2016,1,1]_[128,336,11,11].json | 1 + ...21,21]_[336,2016,1,1]_[128,336,21,21].json | 1 + ...21,21]_[672,2016,1,1]_[128,672,21,21].json | 1 + ...11,11]_[672,2688,1,1]_[128,672,11,11].json | 1 + ...331]_[96,3,3,3]_[128,96,165,165]_relu.json | 1 + ...1]_[336,336,1,1]_[128,336,21,21]_bias.json | 1 + 
...1]_[336,336,1,1]_[128,336,21,21]_relu.json | 1 + ...,42,42]_[168,336,1,1]_[128,168,42,42].json | 1 + ...11,11]_[672,4032,1,1]_[128,672,11,11].json | 1 + ...,42,83,83]_[42,42,1,1]_[128,42,83,83].json | 1 + ...1]_[672,672,1,1]_[128,672,11,11]_bias.json | 1 + ...1]_[672,672,1,1]_[128,672,11,11]_relu.json | 1 + ...2,42]_[84,84,1,1]_[128,84,42,42]_bias.json | 1 + ...2,42]_[84,84,1,1]_[128,84,42,42]_relu.json | 1 + ...165,165]_[42,96,1,1]_[128,42,165,165].json | 1 + ...,96,83,83]_[42,96,1,1]_[128,42,83,83].json | 1 + ...68,42,42]_[3,3,168,1]_[128,168,42,42].json | 1 + ...68,42,42]_[5,5,168,1]_[128,168,42,42].json | 1 + ...36,21,21]_[3,3,336,1]_[128,336,21,21].json | 1 + ...36,21,21]_[5,5,336,1]_[128,336,21,21].json | 1 + ...36,21,21]_[7,7,336,1]_[128,336,21,21].json | 1 + ...36,45,45]_[5,5,336,1]_[128,336,21,21].json | 1 + ...36,47,47]_[7,7,336,1]_[128,336,21,21].json | 1 + ...42,165,165]_[5,5,42,1]_[128,42,83,83].json | 1 + ...8,42,83,83]_[3,3,42,1]_[128,42,83,83].json | 1 + ...8,42,83,83]_[5,5,42,1]_[128,42,83,83].json | 1 + ...8,42,83,83]_[7,7,42,1]_[128,42,83,83].json | 1 + ...72,11,11]_[3,3,672,1]_[128,672,11,11].json | 1 + ...72,11,11]_[5,5,672,1]_[128,672,11,11].json | 1 + ...72,11,11]_[7,7,672,1]_[128,672,11,11].json | 1 + ...72,21,21]_[5,5,672,1]_[128,672,11,11].json | 1 + ...72,21,21]_[7,7,672,1]_[128,672,11,11].json | 1 + ...8,84,42,42]_[3,3,84,1]_[128,84,42,42].json | 1 + ...8,84,42,42]_[5,5,84,1]_[128,84,42,42].json | 1 + ...8,84,42,42]_[7,7,84,1]_[128,84,42,42].json | 1 + ...8,84,83,83]_[5,5,84,1]_[128,84,42,42].json | 1 + ...8,84,83,83]_[7,7,84,1]_[128,84,42,42].json | 1 + ...96,165,165]_[5,5,96,1]_[128,96,83,83].json | 1 + ...96,165,165]_[7,7,96,1]_[128,96,83,83].json | 1 + ...Dot_[128,4032]_[4032,1000]_[128,1000].json | 1 + ...oller_Sum_[128,4032,11,11]_[128,4032].json | 1 + .../nnfusion/kernel_db/roller_res/convert.sh | 26 ++ ...14,14]_[2048,1024,1,1]_[128,2048,7,7].json | 1 + ...14,14]_[256,1024,1,1]_[128,256,14,14].json | 1 + 
...14,14]_[512,1024,1,1]_[128,512,14,14].json | 1 + ...,28,28]_[128,128,3,3]_[128,128,28,28].json | 1 + ...,28,28]_[512,128,1,1]_[128,512,28,28].json | 1 + ...,58,58]_[128,128,3,3]_[128,128,28,28].json | 1 + ...048,7,7]_[512,2048,1,1]_[128,512,7,7].json | 1 + ...4,14]_[1024,256,1,1]_[128,1024,14,14].json | 1 + ...,14,14]_[256,256,3,3]_[128,256,14,14].json | 1 + ...,30,30]_[256,256,3,3]_[128,256,14,14].json | 1 + ...,56,56]_[128,256,1,1]_[128,128,28,28].json | 1 + ...,56,56]_[512,256,1,1]_[128,512,28,28].json | 1 + ...56,56,56]_[64,256,1,1]_[128,64,56,56].json | 1 + ...,230,230]_[64,3,7,7]_[128,64,112,112].json | 1 + ...12,16,16]_[512,512,3,3]_[128,512,7,7].json | 1 + ...8,28]_[1024,512,1,1]_[128,1024,14,14].json | 1 + ...,28,28]_[128,512,1,1]_[128,128,28,28].json | 1 + ...,28,28]_[256,512,1,1]_[128,256,28,28].json | 1 + ...12,7,7]_[2048,512,1,1]_[128,2048,7,7].json | 1 + ...,512,7,7]_[512,512,3,3]_[128,512,7,7].json | 1 + ...4,56,56]_[256,64,1,1]_[128,256,56,56].json | 1 + ...,64,56,56]_[64,64,1,1]_[128,64,56,56].json | 1 + ...,64,56,56]_[64,64,3,3]_[128,64,56,56].json | 1 + ...Dot_[128,2048]_[2048,1000]_[128,1000].json | 1 + ...xPool_[128,64,112,112]_[128,64,56,56].json | 1 + .../roller_Sum_[128,2048,7,7]_[128,2048].json | 1 + 115 files changed, 1421 insertions(+) create mode 100644 src/tools/nnfusion/kernel_db/example.json create mode 100644 src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42].cu create mode 100644 src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21].cu create mode 100644 src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11].cu create mode 100644 src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42].cu create mode 100755 src/tools/nnfusion/kernel_db/parse_bert.sh create mode 100755 src/tools/nnfusion/kernel_db/parse_lstm.sh create mode 100755 
src/tools/nnfusion/kernel_db/parse_nas.sh create mode 100755 src/tools/nnfusion/kernel_db/parse_res.sh create mode 100644 src/tools/nnfusion/kernel_db/parser.out create mode 100644 src/tools/nnfusion/kernel_db/parsetab.py create mode 100755 src/tools/nnfusion/kernel_db/roller_bert/convert.sh create mode 100755 src/tools/nnfusion/kernel_db/roller_bert/convert_ansor.sh create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,512]_[128,16,512,64]_[128,16,512,64].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,64]_[128,16,512,64]_[128,16,512,512].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512,512]_[128,16,512,512].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512]_[128,512,512].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,1024]_[65536,1024].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,4096]_[65536,1024].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,2]_[2,1024]_[65536,1024].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,30522]_[30522,1024]_[65536,1024].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,4096]_[4096,1024]_[65536,1024].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[128,512,1024]_[128,512].json create mode 100644 src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[65536,1024]_[65536].json create mode 100644 src/tools/nnfusion/kernel_db/roller_lstm/roller_Dot_[128,256]_[256,256]_[256,256].json create mode 100755 src/tools/nnfusion/kernel_db/roller_nas/convert.sh create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,1008,42,42]_[128,1008,21,21].json create mode 100644 
src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,42,42]_[128,168,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,83,83]_[128,168,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,2016,21,21]_[128,2016,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,21,21]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,42,42]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,165,165]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,83,83]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,11,11]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,21,21]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,42,42]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,83,83]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,96,165,165]_[128,96,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,21,21]_[168,1008,1,1]_[128,168,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[168,1008,1,1]_[128,168,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[336,1008,1,1]_[128,336,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1344,21,21]_[336,1344,1,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_bias.json create mode 100644 
src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_relu.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[84,168,1,1]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,83,83]_[84,168,1,1]_[128,84,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,11,11]_[336,2016,1,1]_[128,336,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[336,2016,1,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[672,2016,1,1]_[128,672,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2688,11,11]_[672,2688,1,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,3,331,331]_[96,3,3,3]_[128,96,165,165]_relu.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_bias.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_relu.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,42,42]_[168,336,1,1]_[128,168,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,4032,11,11]_[672,4032,1,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,42,83,83]_[42,42,1,1]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_bias.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_relu.json create mode 100644 
src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_bias.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_relu.json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,165,165]_[42,96,1,1]_[128,42,165,165].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,83,83]_[42,96,1,1]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[3,3,168,1]_[128,168,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[5,5,168,1]_[128,168,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[3,3,336,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[5,5,336,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[7,7,336,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,45,45]_[5,5,336,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,47,47]_[7,7,336,1]_[128,336,21,21].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,165,165]_[5,5,42,1]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[3,3,42,1]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[5,5,42,1]_[128,42,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[7,7,42,1]_[128,42,83,83].json create 
mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[3,3,672,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[5,5,672,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[7,7,672,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[5,5,672,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[7,7,672,1]_[128,672,11,11].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[3,3,84,1]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[5,5,84,1]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[7,7,84,1]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[5,5,84,1]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[7,7,84,1]_[128,84,42,42].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[5,5,96,1]_[128,96,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[7,7,96,1]_[128,96,83,83].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Dot_[128,4032]_[4032,1000]_[128,1000].json create mode 100644 src/tools/nnfusion/kernel_db/roller_nas/roller_Sum_[128,4032,11,11]_[128,4032].json create mode 100755 src/tools/nnfusion/kernel_db/roller_res/convert.sh create mode 100644 
src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[2048,1024,1,1]_[128,2048,7,7].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[256,1024,1,1]_[128,256,14,14].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[512,1024,1,1]_[128,512,14,14].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[128,128,3,3]_[128,128,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[512,128,1,1]_[128,512,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,58,58]_[128,128,3,3]_[128,128,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,2048,7,7]_[512,2048,1,1]_[128,512,7,7].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[1024,256,1,1]_[128,1024,14,14].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[256,256,3,3]_[128,256,14,14].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,30,30]_[256,256,3,3]_[128,256,14,14].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[128,256,1,1]_[128,128,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[512,256,1,1]_[128,512,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[64,256,1,1]_[128,64,56,56].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,3,230,230]_[64,3,7,7]_[128,64,112,112].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,16,16]_[512,512,3,3]_[128,512,7,7].json create mode 100644 
src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[1024,512,1,1]_[128,1024,14,14].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[128,512,1,1]_[128,128,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[256,512,1,1]_[128,256,28,28].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[2048,512,1,1]_[128,2048,7,7].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[512,512,3,3]_[128,512,7,7].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[256,64,1,1]_[128,256,56,56].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,1,1]_[128,64,56,56].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,3,3]_[128,64,56,56].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Dot_[128,2048]_[2048,1000]_[128,1000].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_MaxPool_[128,64,112,112]_[128,64,56,56].json create mode 100644 src/tools/nnfusion/kernel_db/roller_res/roller_Sum_[128,2048,7,7]_[128,2048].json diff --git a/src/tools/nnfusion/kernel_db/example.json b/src/tools/nnfusion/kernel_db/example.json new file mode 100644 index 000000000..9dbe13697 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/example.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "filter_shape": [84, 168, 1, 1], "output_shape": [128, 84, 43, 43], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Convolution", "tvm_func_name": "implicit-gemm-128-168-42-42-84-1-1-1-SAME-0-1323-256", "code": "extern \"C\" __global__ void implicit-gemm-128-168-42-42-84-1-1-1-SAME-0-1323-256(float* __restrict__ data, float* __restrict__ 
kernel, float* __restrict__ output) {\n float conv2d_nchw_implicit_gemm_local[64];\n __shared__ float data_2d_shared[4096];\n __shared__ float kernel_2d_shared[256];\n float data_2d_shared_local[4];\n float kernel_2d_shared_local[16];\n for (int vthread_s = 0; vthread_s < 16; ++vthread_s) {\n conv2d_nchw_implicit_gemm_local[((vthread_s * 4))] = 0.000000e+00f;\n }\n for (int vthread_s1 = 0; vthread_s1 < 16; ++vthread_s1) {\n conv2d_nchw_implicit_gemm_local[(((vthread_s1 * 4) + 1))] = 0.000000e+00f;\n }\n for (int vthread_s2 = 0; vthread_s2 < 16; ++vthread_s2) {\n conv2d_nchw_implicit_gemm_local[(((vthread_s2 * 4) + 2))] = 0.000000e+00f;\n }\n for (int vthread_s3 = 0; vthread_s3 < 16; ++vthread_s3) {\n conv2d_nchw_implicit_gemm_local[(((vthread_s3 * 4) + 3))] = 0.000000e+00f;\n }\n for (int k_outer = 0; k_outer < 21; ++k_outer) {\n __syncthreads();\n data_2d_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) + 256) / 1764) * 296352) + (k_outer * 14112)) + (((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) + 256) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 1764))];\n data_2d_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 768) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 
14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 3528))];\n data_2d_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 1280) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 5292))];\n data_2d_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 1792) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 2048))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 7056))];\n data_2d_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 2304) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 2560))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 8820))];\n data_2d_shared[((((int)threadIdx.x) + 2816))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 2816) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n 
data_2d_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 10584))];\n data_2d_shared[((((int)threadIdx.x) + 3328))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 3328) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n data_2d_shared[((((int)threadIdx.x) + 3584))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) / 1764) * 296352) + (k_outer * 14112)) + ((((((int)blockIdx.x) % 441) * 512) + ((int)threadIdx.x)) % 1764)) + 12348))];\n data_2d_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) / 1764) * 296352) + (k_outer * 14112)) + (((((int)threadIdx.x) + 3840) >> 9) * 1764)) + ((((((int)blockIdx.x) % 441) * 512) + (((int)threadIdx.x) + 256)) % 1764)))];\n kernel_2d_shared[(((int)threadIdx.x))] = (((((((int)blockIdx.x) / 441) * 32) + (((int)threadIdx.x) >> 3)) < 84) ? 
kernel[((((((((int)blockIdx.x) / 441) * 5376) + ((((int)threadIdx.x) >> 3) * 168)) + (k_outer * 8)) + (((int)threadIdx.x) & 7)))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 8; ++k_inner_outer) {\n data_2d_shared_local[(0)] = data_2d_shared[(((k_inner_outer * 512) + (((int)threadIdx.x) & 127)))];\n data_2d_shared_local[(1)] = data_2d_shared[((((k_inner_outer * 512) + (((int)threadIdx.x) & 127)) + 128))];\n data_2d_shared_local[(2)] = data_2d_shared[((((k_inner_outer * 512) + (((int)threadIdx.x) & 127)) + 256))];\n data_2d_shared_local[(3)] = data_2d_shared[((((k_inner_outer * 512) + (((int)threadIdx.x) & 127)) + 384))];\n for (int vthread_s4 = 0; vthread_s4 < 16; ++vthread_s4) {\n kernel_2d_shared_local[(vthread_s4)] = kernel_2d_shared[((((vthread_s4 * 16) + ((((int)threadIdx.x) >> 7) * 8)) + k_inner_outer))];\n }\n for (int vthread_s5 = 0; vthread_s5 < 16; ++vthread_s5) {\n conv2d_nchw_implicit_gemm_local[((vthread_s5 * 4))] = (conv2d_nchw_implicit_gemm_local[((vthread_s5 * 4))] + (data_2d_shared_local[(0)] * kernel_2d_shared_local[(vthread_s5)]));\n }\n for (int vthread_s6 = 0; vthread_s6 < 16; ++vthread_s6) {\n conv2d_nchw_implicit_gemm_local[(((vthread_s6 * 4) + 1))] = (conv2d_nchw_implicit_gemm_local[(((vthread_s6 * 4) + 1))] + (data_2d_shared_local[(1)] * kernel_2d_shared_local[(vthread_s6)]));\n }\n for (int vthread_s7 = 0; vthread_s7 < 16; ++vthread_s7) {\n conv2d_nchw_implicit_gemm_local[(((vthread_s7 * 4) + 2))] = (conv2d_nchw_implicit_gemm_local[(((vthread_s7 * 4) + 2))] + (data_2d_shared_local[(2)] * kernel_2d_shared_local[(vthread_s7)]));\n }\n for (int vthread_s8 = 0; vthread_s8 < 16; ++vthread_s8) {\n conv2d_nchw_implicit_gemm_local[(((vthread_s8 * 4) + 3))] = (conv2d_nchw_implicit_gemm_local[(((vthread_s8 * 4) + 3))] + (data_2d_shared_local[(3)] * kernel_2d_shared_local[(vthread_s8)]));\n }\n }\n }\n for (int vthread_s9 = 0; vthread_s9 < 16; ++vthread_s9) {\n if (((((((int)blockIdx.x) / 441) * 32) + (vthread_s9 
* 2)) + (((int)threadIdx.x) >> 7)) < 84) {\n output[(((((((((int)blockIdx.x) / 441) * 7225344) + (vthread_s9 * 451584)) + ((((int)threadIdx.x) >> 7) * 225792)) + ((((int)blockIdx.x) % 441) * 512)) + (((int)threadIdx.x) & 127)))] = conv2d_nchw_implicit_gemm_local[((vthread_s9 * 4))];\n output[((((((((((int)blockIdx.x) / 441) * 7225344) + (vthread_s9 * 451584)) + ((((int)threadIdx.x) >> 7) * 225792)) + ((((int)blockIdx.x) % 441) * 512)) + (((int)threadIdx.x) & 127)) + 128))] = conv2d_nchw_implicit_gemm_local[(((vthread_s9 * 4) + 1))];\n output[((((((((((int)blockIdx.x) / 441) * 7225344) + (vthread_s9 * 451584)) + ((((int)threadIdx.x) >> 7) * 225792)) + ((((int)blockIdx.x) % 441) * 512)) + (((int)threadIdx.x) & 127)) + 256))] = conv2d_nchw_implicit_gemm_local[(((vthread_s9 * 4) + 2))];\n output[((((((((((int)blockIdx.x) / 441) * 7225344) + (vthread_s9 * 451584)) + ((((int)threadIdx.x) >> 7) * 225792)) + ((((int)blockIdx.x) % 441) * 512)) + (((int)threadIdx.x) & 127)) + 384))] = conv2d_nchw_implicit_gemm_local[(((vthread_s9 * 4) + 3))];\n }\n }\n}\n", "gridDim": [1323, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42].cu b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42].cu new file mode 100644 index 000000000..bb470c594 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42].cu @@ -0,0 +1,265 @@ + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif +extern "C" __global__ void __launch_bounds__(384) default_function_kernel0(float* 
__restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) { + float compute_local[64]; + __shared__ float compute_shared[4096]; + __shared__ float compute_d_shared[6144]; + float compute_shared_local[4]; + float compute_d_shared_local[16]; + compute_local[(0)] = 0.000000e+00f; + compute_local[(4)] = 0.000000e+00f; + compute_local[(8)] = 0.000000e+00f; + compute_local[(12)] = 0.000000e+00f; + compute_local[(16)] = 0.000000e+00f; + compute_local[(20)] = 0.000000e+00f; + compute_local[(24)] = 0.000000e+00f; + compute_local[(28)] = 0.000000e+00f; + compute_local[(32)] = 0.000000e+00f; + compute_local[(36)] = 0.000000e+00f; + compute_local[(40)] = 0.000000e+00f; + compute_local[(44)] = 0.000000e+00f; + compute_local[(48)] = 0.000000e+00f; + compute_local[(52)] = 0.000000e+00f; + compute_local[(56)] = 0.000000e+00f; + compute_local[(60)] = 0.000000e+00f; + compute_local[(1)] = 0.000000e+00f; + compute_local[(5)] = 0.000000e+00f; + compute_local[(9)] = 0.000000e+00f; + compute_local[(13)] = 0.000000e+00f; + compute_local[(17)] = 0.000000e+00f; + compute_local[(21)] = 0.000000e+00f; + compute_local[(25)] = 0.000000e+00f; + compute_local[(29)] = 0.000000e+00f; + compute_local[(33)] = 0.000000e+00f; + compute_local[(37)] = 0.000000e+00f; + compute_local[(41)] = 0.000000e+00f; + compute_local[(45)] = 0.000000e+00f; + compute_local[(49)] = 0.000000e+00f; + compute_local[(53)] = 0.000000e+00f; + compute_local[(57)] = 0.000000e+00f; + compute_local[(61)] = 0.000000e+00f; + compute_local[(2)] = 0.000000e+00f; + compute_local[(6)] = 0.000000e+00f; + compute_local[(10)] = 0.000000e+00f; + compute_local[(14)] = 0.000000e+00f; + compute_local[(18)] = 0.000000e+00f; + compute_local[(22)] = 0.000000e+00f; + compute_local[(26)] = 0.000000e+00f; + compute_local[(30)] = 0.000000e+00f; + compute_local[(34)] = 0.000000e+00f; + compute_local[(38)] = 0.000000e+00f; + compute_local[(42)] = 0.000000e+00f; + compute_local[(46)] = 0.000000e+00f; + 
compute_local[(50)] = 0.000000e+00f; + compute_local[(54)] = 0.000000e+00f; + compute_local[(58)] = 0.000000e+00f; + compute_local[(62)] = 0.000000e+00f; + compute_local[(3)] = 0.000000e+00f; + compute_local[(7)] = 0.000000e+00f; + compute_local[(11)] = 0.000000e+00f; + compute_local[(15)] = 0.000000e+00f; + compute_local[(19)] = 0.000000e+00f; + compute_local[(23)] = 0.000000e+00f; + compute_local[(27)] = 0.000000e+00f; + compute_local[(31)] = 0.000000e+00f; + compute_local[(35)] = 0.000000e+00f; + compute_local[(39)] = 0.000000e+00f; + compute_local[(43)] = 0.000000e+00f; + compute_local[(47)] = 0.000000e+00f; + compute_local[(51)] = 0.000000e+00f; + compute_local[(55)] = 0.000000e+00f; + compute_local[(59)] = 0.000000e+00f; + compute_local[(63)] = 0.000000e+00f; + for (int k_outer = 0; k_outer < 6; ++k_outer) { + __syncthreads(); + compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))]; + compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))]; + compute_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 162) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 159) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 156) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 153) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 150) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 147) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 144) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 141) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f); + if (((int)threadIdx.x) < 256) { + compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 138) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f); + } + compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[(((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10080))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14112))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18144))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20160))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 22176))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 26208))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f; + compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f; + __syncthreads(); + for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) { + compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))]; + compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))]; + compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))]; + compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))]; + compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))]; + compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))]; + compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))]; + compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))]; + compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))]; + compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))]; + compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))]; + compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))]; + compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 
3072))]; + compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3456))]; + compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3840))]; + compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4224))]; + compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4608))]; + compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4992))]; + compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5376))]; + compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5760))]; + if (((k_outer * 32) + k_inner_outer) < 168) { + compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)])); + compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)])); + compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)])); + compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)])); + compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)])); + compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)])); + compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)])); + compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)])); + compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)])); + compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)])); + compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(10)])); + compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)])); + compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)])); + compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)])); + compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)])); + compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)])); + compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)])); + compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)])); + compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)])); + compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)])); + compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)])); + compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)])); + compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)])); + compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)])); + compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)])); + compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)])); + compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)])); + compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)])); + compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)])); + compute_local[(53)] = 
(compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)])); + compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)])); + compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)])); + compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)])); + compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)])); + compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)])); + compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)])); + compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)])); + compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)])); + compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)])); + compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)])); + compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)])); + compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)])); + compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)])); + compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)])); + compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)])); + compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)])); + compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)])); + compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(15)])); + compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)])); + compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)])); + compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)])); + compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)])); + compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)])); + compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)])); + compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)])); + compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)])); + compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)])); + compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)])); + compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)])); + compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)])); + compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)])); + compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)])); + compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)])); + compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)])); + } + } + } + compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]); 
+ compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = (compute_local[(8)] + 
bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]); + 
compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = 
(compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))] = (compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))] = (compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))] = (compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 
31)) + 18966592))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))] = (compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))] = (compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))] = (compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))] = (compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))] = (compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))] = (compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))] = (compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) 
& 31)) + 24385600))] = (compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))] = (compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))] = (compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))] = (compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))] = (compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))] = (compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))] = (compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))] = (compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 31)) + 29804576))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))] = (compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))] = (compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))] = (compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))] = (compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))] = (compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))] = (compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))] = (compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) 
+ (((int)threadIdx.x) & 31)) + 35223584))] = (compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))] = (compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))] = (compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))]); +} + +dim3 grid(1764, 1, 1); +dim3 block(384, 1, 1); diff --git a/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21].cu b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21].cu new file mode 100644 index 000000000..d026b0896 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21].cu @@ -0,0 +1,176 @@ + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif +extern "C" __global__ void __launch_bounds__(256) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) { + float compute_local[32]; + __shared__ float compute_shared[4096]; + __shared__ float compute_d_shared[2048]; + float compute_shared_local[4]; + float compute_d_shared_local[8]; + compute_local[(0)] = 0.000000e+00f; + compute_local[(4)] = 0.000000e+00f; + 
compute_local[(8)] = 0.000000e+00f; + compute_local[(12)] = 0.000000e+00f; + compute_local[(16)] = 0.000000e+00f; + compute_local[(20)] = 0.000000e+00f; + compute_local[(24)] = 0.000000e+00f; + compute_local[(28)] = 0.000000e+00f; + compute_local[(1)] = 0.000000e+00f; + compute_local[(5)] = 0.000000e+00f; + compute_local[(9)] = 0.000000e+00f; + compute_local[(13)] = 0.000000e+00f; + compute_local[(17)] = 0.000000e+00f; + compute_local[(21)] = 0.000000e+00f; + compute_local[(25)] = 0.000000e+00f; + compute_local[(29)] = 0.000000e+00f; + compute_local[(2)] = 0.000000e+00f; + compute_local[(6)] = 0.000000e+00f; + compute_local[(10)] = 0.000000e+00f; + compute_local[(14)] = 0.000000e+00f; + compute_local[(18)] = 0.000000e+00f; + compute_local[(22)] = 0.000000e+00f; + compute_local[(26)] = 0.000000e+00f; + compute_local[(30)] = 0.000000e+00f; + compute_local[(3)] = 0.000000e+00f; + compute_local[(7)] = 0.000000e+00f; + compute_local[(11)] = 0.000000e+00f; + compute_local[(15)] = 0.000000e+00f; + compute_local[(19)] = 0.000000e+00f; + compute_local[(23)] = 0.000000e+00f; + compute_local[(27)] = 0.000000e+00f; + compute_local[(31)] = 0.000000e+00f; + for (int k_outer = 0; k_outer < 11; ++k_outer) { + __syncthreads(); + compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)))]; + compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 882))]; + compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + 
((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1764))]; + compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))]; + compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 3528))]; + compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 4410))]; + compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))]; + compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6174))]; + compute_shared[((((int)threadIdx.x) + 2048))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 320) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7056))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 318) ? 
data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 2560))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 316) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 8820))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 2816))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 314) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9702))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 312) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3328))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 310) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11466))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3584))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 308) ? 
data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 12348))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 306) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))] : 0.000000e+00f); + compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 256))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2688))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 512))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 320) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5376))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 768))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 312) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1024))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 304) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10752))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1280))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 296) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 13440))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 288) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1792))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 280) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18816))] : 0.000000e+00f); + __syncthreads(); + for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) { + compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))]; + compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))]; + compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))]; + compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))]; + compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))]; + compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))]; + compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))]; + compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))]; + compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))]; + compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))]; + compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))]; + compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))]; + if (((k_outer * 32) + k_inner_outer) < 336) { + compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)])); + compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)])); + compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)])); + compute_local[(12)] = (compute_local[(12)] + 
(compute_shared_local[(0)] * compute_d_shared_local[(3)])); + compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)])); + compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)])); + compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)])); + compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)])); + compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)])); + compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)])); + compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)])); + compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)])); + compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)])); + compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)])); + compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)])); + compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)])); + compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)])); + compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)])); + compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)])); + compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)])); + compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)])); + compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)])); + compute_local[(26)] = 
(compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)])); + compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)])); + compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)])); + compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)])); + compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)])); + compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)])); + compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)])); + compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)])); + compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)])); + compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)])); + } + } + } + compute[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))]); + if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 320) { + 
compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))]); + } + if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 312) { + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + 
((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))]); + } + if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 304) { + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))] = 
(compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))]); + } + if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 296) { + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 
5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))]); + } + if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 288) { + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]); + } + if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 280) { + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + 
((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))]); + compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))]); + } +} + +dim3 grid(2646, 1, 1); +dim3 block(256, 1, 1); diff --git a/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11].cu b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11].cu new file mode 100644 index 000000000..40e4b3313 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11].cu @@ -0,0 +1,159 @@ + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif 
+// Generated tiled multiply-accumulate kernel with a fused element-wise `bias` add.
+//
+// Launch shape (see the dim3 lines below the kernel): 847 blocks of 384 threads.
+// blockIdx.x is split into a column tile (blockIdx.x % 121, 128 output columns
+// each) and a row group (blockIdx.x / 121, 96 output rows each).
+//
+// The reduction runs over 21 chunks of depth 32. For every chunk the block
+// stages a 32x128 tile of `data` and a 96x32 tile of `kernel` in shared memory;
+// each thread then updates 32 private accumulators (8 rows x 4 columns).
+// Finally every accumulator is written to `compute` after adding the `bias`
+// element at the same flat index.
+//
+// NOTE(review): the strides (81312 = 672 * 121, 15488 = 128 * 121) suggest a 1x1
+// convolution over a [128, 672, 11, 11] input with a [672, 128 * 121] result
+// layout -- confirm against the kernel_db entry this file belongs to.
+extern "C" __global__ void __launch_bounds__(384) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+  __shared__ float compute_shared[4096];
+  __shared__ float compute_d_shared[3072];
+  float acc[32];
+  float data_frag[4];
+  float kernel_frag[8];
+
+  const int tid = static_cast<int>(threadIdx.x);
+  const int bid = static_cast<int>(blockIdx.x);
+  const int lane = tid & 31;       // column offset inside a 32-wide group
+  const int row_in_group = tid >> 5;  // 0..11
+  const int col_tile = bid % 121;
+  const int row_group = bid / 121;
+
+  // Column this thread fetches while staging `data`, split by the 121 stride.
+  const int stage_col = (col_tile * 128) + (tid & 127);
+  const int data_origin = ((stage_col / 121) * 81312) + ((tid >> 7) * 121) + (stage_col % 121);
+  const int kernel_origin = (row_group * 64512) + (row_in_group * 672) + lane;
+  const int out_origin = (row_group * 1486848) + (row_in_group * 15488) + (col_tile * 128) + lane;
+
+  #pragma unroll
+  for (int slot = 0; slot < 32; ++slot) {
+    acc[slot] = 0.000000e+00f;
+  }
+
+  for (int chunk = 0; chunk < 21; ++chunk) {
+    // Every thread must be done reading the previous chunk's tiles before they are overwritten.
+    __syncthreads();
+
+    const int data_src = data_origin + (chunk * 3872);
+    #pragma unroll
+    for (int pass = 0; pass < 10; ++pass) {
+      compute_shared[tid + (pass * 384)] = data[data_src + (pass * 363)];
+    }
+    // Eleventh pass: only 256 slots of the 4096-float tile are left to fill.
+    if (tid < 256) {
+      compute_shared[tid + 3840] = data[data_src + 3630];
+    }
+
+    const int kernel_src = kernel_origin + (chunk * 32);
+    #pragma unroll
+    for (int pass = 0; pass < 8; ++pass) {
+      compute_d_shared[tid + (pass * 384)] = kernel[kernel_src + (pass * 8064)];
+    }
+
+    // Both tiles must be complete before any thread starts consuming them.
+    __syncthreads();
+
+    for (int depth = 0; depth < 32; ++depth) {
+      #pragma unroll
+      for (int c = 0; c < 4; ++c) {
+        data_frag[c] = compute_shared[(depth * 128) + lane + (c * 32)];
+      }
+      #pragma unroll
+      for (int r = 0; r < 8; ++r) {
+        kernel_frag[r] = compute_d_shared[(row_in_group * 32) + depth + (r * 384)];
+      }
+      // Accumulator (r, c) lives at acc[r * 4 + c].
+      #pragma unroll
+      for (int c = 0; c < 4; ++c) {
+        #pragma unroll
+        for (int r = 0; r < 8; ++r) {
+          acc[(r * 4) + c] = (acc[(r * 4) + c] + (data_frag[c] * kernel_frag[r]));
+        }
+      }
+    }
+  }
+
+  // Write-back: rows of one thread are 12 apart (12 * 15488 = 185856), columns 32 apart.
+  #pragma unroll
+  for (int c = 0; c < 4; ++c) {
+    #pragma unroll
+    for (int r = 0; r < 8; ++r) {
+      const int out_idx = out_origin + (r * 185856) + (c * 32);
+      compute[out_idx] = (acc[(r * 4) + c] + bias[out_idx]);
+    }
+  }
+}
+
+dim3 grid(847, 1, 1);
+dim3 block(384, 1, 1);
diff --git a/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42].cu b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42].cu new file mode 100644 index 000000000..eafdb2b7b --- /dev/null +++ b/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42].cu @@ -0,0 +1,157 @@ + +#ifdef _WIN32 + using uint = unsigned int; + using uchar = unsigned char; + using ushort = unsigned short; + using int64_t = long long; + using uint64_t = unsigned long long; +#else + #define uint unsigned int + #define uchar unsigned char + #define ushort unsigned short + #define int64_t long long + #define uint64_t unsigned long long +#endif +extern "C" __global__ void __launch_bounds__(384) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) { + float compute_local[32]; + __shared__ float compute_shared[4096]; + __shared__ float compute_d_shared[3072]; + float compute_shared_local[4]; + float compute_d_shared_local[8]; +
compute_local[(0)] = 0.000000e+00f; + compute_local[(4)] = 0.000000e+00f; + compute_local[(8)] = 0.000000e+00f; + compute_local[(12)] = 0.000000e+00f; + compute_local[(16)] = 0.000000e+00f; + compute_local[(20)] = 0.000000e+00f; + compute_local[(24)] = 0.000000e+00f; + compute_local[(28)] = 0.000000e+00f; + compute_local[(1)] = 0.000000e+00f; + compute_local[(5)] = 0.000000e+00f; + compute_local[(9)] = 0.000000e+00f; + compute_local[(13)] = 0.000000e+00f; + compute_local[(17)] = 0.000000e+00f; + compute_local[(21)] = 0.000000e+00f; + compute_local[(25)] = 0.000000e+00f; + compute_local[(29)] = 0.000000e+00f; + compute_local[(2)] = 0.000000e+00f; + compute_local[(6)] = 0.000000e+00f; + compute_local[(10)] = 0.000000e+00f; + compute_local[(14)] = 0.000000e+00f; + compute_local[(18)] = 0.000000e+00f; + compute_local[(22)] = 0.000000e+00f; + compute_local[(26)] = 0.000000e+00f; + compute_local[(30)] = 0.000000e+00f; + compute_local[(3)] = 0.000000e+00f; + compute_local[(7)] = 0.000000e+00f; + compute_local[(11)] = 0.000000e+00f; + compute_local[(15)] = 0.000000e+00f; + compute_local[(19)] = 0.000000e+00f; + compute_local[(23)] = 0.000000e+00f; + compute_local[(27)] = 0.000000e+00f; + compute_local[(31)] = 0.000000e+00f; + for (int k_outer = 0; k_outer < 3; ++k_outer) { + __syncthreads(); + compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))]; + compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))]; + compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + 
((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))]; + compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))]; + compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))]; + compute_shared[((((int)threadIdx.x) + 1920))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))]; + compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 66) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 63) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 60) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f); + compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 57) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f); + if (((int)threadIdx.x) < 256) { + compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 54) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f); + } + compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[(((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1008))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3024))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5040))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f); + compute_d_shared[((((int)threadIdx.x) + 2688))] = 0.000000e+00f; + __syncthreads(); + for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) { + compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))]; + compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))]; + compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))]; + compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))]; + compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))]; + compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))]; + compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))]; + compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))]; + compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))]; + compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))]; + compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))]; + compute_d_shared_local[(7)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))]; + if (((k_outer * 32) + k_inner_outer) < 84) { + compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)])); + compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)])); + compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)])); + compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)])); + compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)])); + compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)])); + compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)])); + compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)])); + compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)])); + compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)])); + compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)])); + compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)])); + compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)])); + compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)])); + compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)])); + compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)])); + compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)])); + compute_local[(6)] = (compute_local[(6)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(1)])); + compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)])); + compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)])); + compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)])); + compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)])); + compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)])); + compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)])); + compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)])); + compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)])); + compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)])); + compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)])); + compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)])); + compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)])); + compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)])); + compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)])); + } + } + } + compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 31)) + 5419040))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 
128)) + (((int)threadIdx.x) & 31)) + 10838016))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) 
* 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]); + compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]); +} + +dim3 grid(1764, 1, 1); +dim3 block(384, 1, 1); diff --git a/src/tools/nnfusion/kernel_db/parse_bert.sh b/src/tools/nnfusion/kernel_db/parse_bert.sh new file mode 100755 index 000000000..946f75aca --- /dev/null +++ b/src/tools/nnfusion/kernel_db/parse_bert.sh @@ -0,0 +1,12 @@ +python parse_code.py --op_type BatchMatMul --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_BatchMatMul_\[128\,16\,512\,512\]_\[128\,16\,512\,64\]_\[128\,16\,512\,64\].cu --json_file roller_bert/roller_BatchMatMul_[128,16,512,512]_[128,16,512,64]_[128,16,512,64].json --input0_shape 128 16 512 512 --input1_shape 128 16 512 64 --output0_shape 128 16 512 64 +python parse_code.py --op_type BatchMatMul --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_BatchMatMul_\[128\,16\,512\,64\]_\[128\,16\,512\,64\]_\[128\,16\,512\,512\].cu --json_file 
roller_bert/roller_BatchMatMul_[128,16,512,64]_[128,16,512,64]_[128,16,512,512].json --input0_shape 128 16 512 64 --input1_shape 128 16 64 512 --output0_shape 128 16 512 512 +python parse_code.py --op_type Broadcast --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Broadcast_\[128\,512\]_\[128\,512\,512\].cu --json_file roller_bert/roller_Broadcast_\[128\,512\]_\[128\,512\,512\].json --input0_shape 128 512 --output0_shape 128 512 512 --broadcast_axis 1 +python parse_code.py --op_type Broadcast --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Broadcast_\[128\,512\,512\]_\[128\,16\,512\,512\].cu --json_file roller_bert/roller_Broadcast_\[128\,512\,512\]_\[128\,16\,512\,512\].json --input0_shape 128 512 512 --output0_shape 128 16 512 512 --broadcast_axis 1 +python parse_code.py --op_type Dot --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Dot_\[65536\,1024\]_\[1024\,1024\]_\[65536\,1024\].cu --json_file roller_bert/roller_Dot_\[65536\,1024\]_\[1024\,1024\]_\[65536\,1024\].json --input0_shape 65536 1024 --input1_shape 1024 1024 --output0_shape 65536 1024 +python parse_code.py --op_type Dot --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Dot_\[65536\,1024\]_\[1024\,4096\]_\[65536\,4096\].cu --json_file roller_bert/roller_Dot_\[65536\,1024\]_\[1024\,4096\]_\[65536\,4096\].json --input0_shape 65536 1024 --input1_shape 1024 4096 --output0_shape 65536 4096 +python parse_code.py --op_type Dot --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Dot_\[65536\,2\]_\[2\,1024\]_\[65536\,1024\].cu --json_file roller_bert/roller_Dot_\[65536\,2\]_\[2\,1024\]_\[65536\,1024\].json --input0_shape 65536 2 --input1_shape 2 1024 --output0_shape 65536 1024 +python parse_code.py --op_type Dot --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Dot_\[65536\,30522\]_\[30522\,1024\]_\[65536\,1024\].cu --json_file roller_bert/roller_Dot_\[65536\,30522\]_\[30522\,1024\]_\[65536\,1024\].json --input0_shape 65536 30522 --input1_shape 30522 1024 --output0_shape 65536 1024 +python parse_code.py --op_type Dot --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Dot_\[65536\,4096\]_\[4096\,1024\]_\[65536\,1024\].cu --json_file roller_bert/roller_Dot_\[65536\,4096\]_\[4096\,1024\]_\[65536\,1024\].json --input0_shape 65536 4096 --input1_shape 4096 1024 --output0_shape 65536 1024 +python parse_code.py --op_type Sum --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Sum_\[128\,512\,1024\]_\[128\,512\].cu --json_file roller_bert/roller_Sum_\[128\,512\,1024\]_\[128\,512\].json --input0_shape 128 512 1024 --output0_shape 128 512 --reduction_axis 2 +python parse_code.py --op_type Sum --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/bert_large_bs128/roller_Sum_\[65536\,1024\]_\[65536\].cu --json_file roller_bert/roller_Sum_\[65536\,1024\]_\[65536\].json --input0_shape 65536 1024 --output0_shape 65536 --reduction_axis 1 + diff --git a/src/tools/nnfusion/kernel_db/parse_lstm.sh b/src/tools/nnfusion/kernel_db/parse_lstm.sh new file mode 100755 index 000000000..af3ac8125 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/parse_lstm.sh @@ -0,0 +1 @@ +python parse_code.py --op_type Dot --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/lstm_bs128/roller_Dot_\[128\,256\]_\[256\,256\]_\[256\,256\].cu --json_file roller_lstm/roller_Dot_\[128\,256\]_\[256\,256\]_\[256\,256\].json --input0_shape 128 256 --input1_shape 256 256 --output0_shape 128 256 \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/parse_nas.sh b/src/tools/nnfusion/kernel_db/parse_nas.sh new file mode 100755 
index 000000000..74e419170 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/parse_nas.sh @@ -0,0 +1,64 @@ +python parse_code.py --op_type AvgPool --input0_shape 128 42 83 83 --output0_shape 128 42 83 83 --window_shape 3 3 --stride 1 1 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,42\,83\,83\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_AvgPool_\[128\,42\,83\,83\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type AvgPool --input0_shape 128 42 165 165 --output0_shape 128 42 83 83 --window_shape 3 3 --stride 2 2 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,42\,165\,165\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_AvgPool_\[128\,42\,165\,165\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type AvgPool --input0_shape 128 96 165 165 --output0_shape 128 96 83 83 --window_shape 1 1 --stride 2 2 --padding 0 0 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,96\,165\,165\]_\[128\,96\,83\,83\].cu --json_file roller_nas/roller_AvgPool_\[128\,96\,165\,165\]_\[128\,96\,83\,83\].json +python parse_code.py --op_type AvgPool --input0_shape 128 84 42 42 --output0_shape 128 84 42 42 --window_shape 3 3 --stride 1 1 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,84\,42\,42\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_AvgPool_\[128\,84\,42\,42\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type AvgPool --input0_shape 128 84 83 83 --output0_shape 128 84 42 42 --window_shape 3 3 --stride 2 2 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,84\,83\,83\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_AvgPool_\[128\,84\,83\,83\]_\[128\,84\,42\,42\].json +python parse_code.py 
--op_type AvgPool --input0_shape 128 168 83 83 --output0_shape 128 168 42 42 --window_shape 1 1 --stride 2 2 --padding 0 0 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,168\,83\,83\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_AvgPool_\[128\,168\,83\,83\]_\[128\,168\,42\,42\].json +python parse_code.py --op_type AvgPool --input0_shape 128 168 42 42 --output0_shape 128 168 42 42 --window_shape 3 3 --stride 1 1 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,168\,42\,42\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_AvgPool_\[128\,168\,42\,42\]_\[128\,168\,42\,42\].json +python parse_code.py --op_type AvgPool --input0_shape 128 336 21 21 --output0_shape 128 336 21 21 --window_shape 3 3 --stride 1 1 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,336\,21\,21\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_AvgPool_\[128\,336\,21\,21\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type AvgPool --input0_shape 128 336 42 42 --output0_shape 128 336 21 21 --window_shape 3 3 --stride 2 2 --padding 0 0 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,336\,42\,42\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_AvgPool_\[128\,336\,42\,42\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type AvgPool --input0_shape 128 1008 42 42 --output0_shape 128 1008 21 21 --window_shape 1 1 --stride 2 2 --padding 0 0 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,1008\,42\,42\]_\[128\,1008\,21\,21\].cu --json_file roller_nas/roller_AvgPool_\[128\,1008\,42\,42\]_\[128\,1008\,21\,21\].json +python parse_code.py --op_type AvgPool --input0_shape 128 672 11 11 --output0_shape 128 672 11 11 --window_shape 3 3 
--stride 1 1 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,672\,11\,11\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_AvgPool_\[128\,672\,11\,11\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type AvgPool --input0_shape 128 672 21 21 --output0_shape 128 672 11 11 --window_shape 3 3 --stride 2 2 --padding 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,672\,21\,21\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_AvgPool_\[128\,672\,21\,21\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type AvgPool --input0_shape 128 2016 21 21 --output0_shape 128 2016 11 11 --window_shape 1 1 --stride 2 2 --padding 0 0 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_AvgPool_\[128\,2016\,21\,21\]_\[128\,2016\,11\,11\].cu --json_file roller_nas/roller_AvgPool_\[128\,2016\,21\,21\]_\[128\,2016\,11\,11\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 3 331 331 --input1_shape 96 3 3 3 --output0_shape 128 96 165 165 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,3\,331\,331\]_\[96\,3\,3\,3\]_\[128\,96\,165\,165\]_relu.cu --json_file roller_nas/roller_Convolution_\[128\,3\,331\,331\]_\[96\,3\,3\,3\]_\[128\,96\,165\,165\]_relu.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 96 165 165 --input1_shape 42 96 1 1 --output0_shape 128 42 165 165 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,96\,165\,165\]_\[42\,96\,1\,1\]_\[128\,42\,165\,165\].cu --json_file roller_nas/roller_Convolution_\[128\,96\,165\,165\]_\[42\,96\,1\,1\]_\[128\,42\,165\,165\].json +python parse_code.py --op_type 
Fused_Convolution_Add_Relu --input0_shape 128 96 83 83 --input1_shape 42 96 1 1 --output0_shape 128 42 83 83 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,96\,83\,83\]_\[42\,96\,1\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_Convolution_\[128\,96\,83\,83\]_\[42\,96\,1\,1\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 42 83 83 --input1_shape 42 42 1 1 --output0_shape 128 42 83 83 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,42\,83\,83\]_\[42\,42\,1\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_Convolution_\[128\,42\,83\,83\]_\[42\,42\,1\,1\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 168 83 83 --input1_shape 84 168 1 1 --output0_shape 128 84 83 83 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,168\,83\,83\]_\[84\,168\,1\,1\]_\[128\,84\,83\,83\].cu --json_file roller_nas/roller_Convolution_\[128\,168\,83\,83\]_\[84\,168\,1\,1\]_\[128\,84\,83\,83\].json +python parse_code.py --op_type Convolution --input0_shape 128 96 83 83 --input1_shape 42 96 1 1 --output0_shape 128 42 83 83 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,96\,83\,83\]_\[42\,96\,1\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_Convolution_\[128\,96\,83\,83\]_\[42\,96\,1\,1\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 84 42 42 --input1_shape 84 84 1 1 --output0_shape 128 84 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,84\,42\,42\]_\[84\,84\,1\,1\]_\[128\,84\,42\,42\]_relu.cu --json_file roller_nas/roller_Convolution_\[128\,84\,42\,42\]_\[84\,84\,1\,1\]_\[128\,84\,42\,42\]_relu.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 336 42 42 --input1_shape 168 336 1 1 --output0_shape 128 168 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,336\,42\,42\]_\[168\,336\,1\,1\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_Convolution_\[128\,336\,42\,42\]_\[168\,336\,1\,1\]_\[128\,168\,42\,42\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 168 42 42 --input1_shape 168 168 1 1 --output0_shape 128 168 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,168\,42\,42\]_\[168\,168\,1\,1\]_\[128\,168\,42\,42\]_relu.cu --json_file roller_nas/roller_Convolution_\[128\,168\,42\,42\]_\[168\,168\,1\,1\]_\[128\,168\,42\,42\]_relu.json +python parse_code.py --op_type Convolution --input0_shape 128 168 42 42 --input1_shape 84 168 1 1 --output0_shape 128 84 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,168\,42\,42\]_\[84\,168\,1\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_Convolution_\[128\,168\,42\,42\]_\[84\,168\,1\,1\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 1008 42 42 --input1_shape 168 1008 1 1 --output0_shape 128 168 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,1008\,42\,42\]_\[168\,1008\,1\,1\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_Convolution_\[128\,1008\,42\,42\]_\[168\,1008\,1\,1\]_\[128\,168\,42\,42\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 1008 42 42 --input1_shape 336 1008 1 1 --output0_shape 128 336 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,1008\,42\,42\]_\[336\,1008\,1\,1\]_\[128\,336\,42\,42\].cu --json_file roller_nas/roller_Convolution_\[128\,1008\,42\,42\]_\[336\,1008\,1\,1\]_\[128\,336\,42\,42\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 336 21 21 --input1_shape 336 336 1 1 --output0_shape 128 336 21 21 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,336\,21\,21\]_\[336\,336\,1\,1\]_\[128\,336\,21\,21\]_relu.cu --json_file roller_nas/roller_Convolution_\[128\,336\,21\,21\]_\[336\,336\,1\,1\]_\[128\,336\,21\,21\]_relu.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 1344 21 21 --input1_shape 336 1344 1 1 --output0_shape 128 336 21 21 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,1344\,21\,21\]_\[336\,1344\,1\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_Convolution_\[128\,1344\,21\,21\]_\[336\,1344\,1\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type Convolution --input0_shape 128 1008 21 21 --input1_shape 168 1008 1 1 --output0_shape 128 168 21 21 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,1008\,21\,21\]_\[168\,1008\,1\,1\]_\[128\,168\,21\,21\].cu --json_file roller_nas/roller_Convolution_\[128\,1008\,21\,21\]_\[168\,1008\,1\,1\]_\[128\,168\,21\,21\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 2016 21 21 --input1_shape 336 2016 1 1 --output0_shape 128 336 21 21 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,2016\,21\,21\]_\[336\,2016\,1\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_Convolution_\[128\,2016\,21\,21\]_\[336\,2016\,1\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 2016 21 21 --input1_shape 672 2016 1 1 --output0_shape 128 672 21 21 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,2016\,21\,21\]_\[672\,2016\,1\,1\]_\[128\,672\,21\,21\].cu --json_file roller_nas/roller_Convolution_\[128\,2016\,21\,21\]_\[672\,2016\,1\,1\]_\[128\,672\,21\,21\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 672 11 11 --input1_shape 672 672 1 1 --output0_shape 128 672 11 11 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,672\,11\,11\]_\[672\,672\,1\,1\]_\[128\,672\,11\,11\]_relu.cu --json_file roller_nas/roller_Convolution_\[128\,672\,11\,11\]_\[672\,672\,1\,1\]_\[128\,672\,11\,11\]_relu.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 2688 11 11 --input1_shape 672 2688 1 1 --output0_shape 128 672 11 11 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,2688\,11\,11\]_\[672\,2688\,1\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_Convolution_\[128\,2688\,11\,11\]_\[672\,2688\,1\,1\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type Convolution --input0_shape 128 2016 11 11 --input1_shape 336 2016 1 1 --output0_shape 128 336 11 11 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,2016\,11\,11\]_\[336\,2016\,1\,1\]_\[128\,336\,11\,11\].cu --json_file roller_nas/roller_Convolution_\[128\,2016\,11\,11\]_\[336\,2016\,1\,1\]_\[128\,336\,11\,11\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 4032 11 11 --input1_shape 672 4032 1 1 --output0_shape 128 672 11 11 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Convolution_\[128\,4032\,11\,11\]_\[672\,4032\,1\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_Convolution_\[128\,4032\,11\,11\]_\[672\,4032\,1\,1\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 96 165 165 --input1_shape 96 1 7 7 --output0_shape 128 96 83 83 --stride 2 2 --padding 3 3 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,96\,165\,165\]_\[7\,7\,96\,1\]_\[128\,96\,83\,83\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,96\,165\,165\]_\[7\,7\,96\,1\]_\[128\,96\,83\,83\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 42 83 83 --input1_shape 42 1 7 7 --output0_shape 128 42 83 83 --stride 1 1 --padding 3 3 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,42\,83\,83\]_\[7\,7\,42\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,42\,83\,83\]_\[7\,7\,42\,1\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 42 165 165 --input1_shape 42 1 5 5 --output0_shape 128 42 83 83 --stride 2 2 --padding 2 2 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,42\,165\,165\]_\[5\,5\,42\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,42\,165\,165\]_\[5\,5\,42\,1\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 42 83 83 --input1_shape 42 1 5 5 --output0_shape 128 42 83 83 --stride 1 1 --padding 2 2 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,42\,83\,83\]_\[5\,5\,42\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,42\,83\,83\]_\[5\,5\,42\,1\]_\[128\,42\,83\,83\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 42 83 83 --input1_shape 42 1 3 3 --output0_shape 128 42 83 83 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,42\,83\,83\]_\[3\,3\,42\,1\]_\[128\,42\,83\,83\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,42\,83\,83\]_\[3\,3\,42\,1\]_\[128\,42\,83\,83\].json # todo +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 96 165 165 --input1_shape 96 1 5 5 --output0_shape 128 96 83 83 --stride 2 2 --padding 2 2 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,96\,165\,165\]_\[5\,5\,96\,1\]_\[128\,96\,83\,83\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,96\,165\,165\]_\[5\,5\,96\,1\]_\[128\,96\,83\,83\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 84 83 83 --input1_shape 84 1 7 7 --output0_shape 128 84 42 42 --stride 2 2 --padding 3 3 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,84\,83\,83\]_\[7\,7\,84\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,84\,83\,83\]_\[7\,7\,84\,1\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 84 42 42 --input1_shape 84 1 7 7 --output0_shape 128 84 42 42 --stride 1 1 --padding 3 3 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,84\,42\,42\]_\[7\,7\,84\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,84\,42\,42\]_\[7\,7\,84\,1\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 84 83 83 --input1_shape 84 1 5 5 --output0_shape 128 84 42 42 --stride 2 2 --padding 2 2 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,84\,83\,83\]_\[5\,5\,84\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,84\,83\,83\]_\[5\,5\,84\,1\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 84 42 42 --input1_shape 84 1 5 5 --output0_shape 128 84 42 42 --stride 1 1 --padding 2 2 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,84\,42\,42\]_\[5\,5\,84\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,84\,42\,42\]_\[5\,5\,84\,1\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 84 42 42 --input1_shape 84 1 3 3 --output0_shape 128 84 42 42 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,84\,42\,42\]_\[3\,3\,84\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,84\,42\,42\]_\[3\,3\,84\,1\]_\[128\,84\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 168 42 42 --input1_shape 168 1 3 3 --output0_shape 128 168 42 42 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,168\,42\,42\]_\[3\,3\,168\,1\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,168\,42\,42\]_\[3\,3\,168\,1\]_\[128\,168\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 168 42 42 --input1_shape 168 1 5 5 --output0_shape 128 168 42 42 --stride 1 1 --padding 2 2 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,168\,42\,42\]_\[5\,5\,168\,1\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,168\,42\,42\]_\[5\,5\,168\,1\]_\[128\,168\,42\,42\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 336 47 47 --input1_shape 336 1 7 7 --output0_shape 128 336 21 21 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,336\,47\,47\]_\[7\,7\,336\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,336\,47\,47\]_\[7\,7\,336\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 336 21 21 --input1_shape 336 1 7 7 --output0_shape 128 336 21 21 --stride 1 1 --padding 3 3 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,336\,21\,21\]_\[7\,7\,336\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,336\,21\,21\]_\[7\,7\,336\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 336 45 45 --input1_shape 336 1 5 5 --output0_shape 128 336 21 21 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,336\,45\,45\]_\[5\,5\,336\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,336\,45\,45\]_\[5\,5\,336\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 336 21 21 --input1_shape 336 1 5 5 --output0_shape 128 336 21 21 --stride 1 1 --padding 2 2 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,336\,21\,21\]_\[5\,5\,336\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,336\,21\,21\]_\[5\,5\,336\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 336 21 21 --input1_shape 336 1 3 3 --output0_shape 128 336 21 21 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,336\,21\,21\]_\[3\,3\,336\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,336\,21\,21\]_\[3\,3\,336\,1\]_\[128\,336\,21\,21\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 672 21 21 --input1_shape 672 1 7 7 --output0_shape 128 672 11 11 --stride 2 2 --padding 3 3 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,672\,21\,21\]_\[7\,7\,672\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,672\,21\,21\]_\[7\,7\,672\,1\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 672 11 11 --input1_shape 672 1 7 7 --output0_shape 128 672 11 11 --stride 1 1 --padding 3 3 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,672\,11\,11\]_\[7\,7\,672\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,672\,11\,11\]_\[7\,7\,672\,1\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 672 21 21 --input1_shape 672 1 5 5 --output0_shape 128 672 11 11 --stride 2 2 --padding 2 2 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,672\,21\,21\]_\[5\,5\,672\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,672\,21\,21\]_\[5\,5\,672\,1\]_\[128\,672\,11\,11\].json #todo +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 672 11 11 --input1_shape 672 1 5 5 --output0_shape 128 672 11 11 --stride 1 1 --padding 2 2 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,672\,11\,11\]_\[5\,5\,672\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,672\,11\,11\]_\[5\,5\,672\,1\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type DepthwiseConv2dNative --input0_shape 128 672 11 11 --input1_shape 672 1 3 3 --output0_shape 128 672 11 11 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_DepthwiseConv2dNative_\[128\,672\,11\,11\]_\[3\,3\,672\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_DepthwiseConv2dNative_\[128\,672\,11\,11\]_\[3\,3\,672\,1\]_\[128\,672\,11\,11\].json +python parse_code.py --op_type Dot --input0_shape 128 4032 --input1_shape 4032 1000 --output0_shape 128 1000 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Dot_\[128\,4032\]_\[4032\,1000\]_\[128\,1000\].cu --json_file roller_nas/roller_Dot_\[128\,4032\]_\[4032\,1000\]_\[128\,1000\].json +python parse_code.py --op_type Sum --input0_shape 128 4032 11 11 --output0_shape 128 4032 --reduction_axis 2 3 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/nas_bs128/roller_Sum_\[128\,4032\,11\,11\]_\[128\,4032\].cu --json_file roller_nas/roller_Sum_\[128\,4032\,11\,11\]_\[128\,4032\].json + +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 672 11 11 --input1_shape 672 672 1 1 --output0_shape 128 672 11 11 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/yuqxia/repo/nnfusion/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_\[128\,672\,11\,11\]_\[672\,672\,1\,1\]_\[128\,672\,11\,11\].cu --json_file roller_nas/roller_Convolution_\[128\,672\,11\,11\]_\[672\,672\,1\,1\]_\[128\,672\,11\,11\]_bias.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 336 21 21 --input1_shape 336 336 1 1 
--output0_shape 128 336 21 21 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/yuqxia/repo/nnfusion/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_\[128\,336\,21\,21\]_\[336\,336\,1\,1\]_\[128\,336\,21\,21\].cu --json_file roller_nas/roller_Convolution_\[128\,336\,21\,21\]_\[336\,336\,1\,1\]_\[128\,336\,21\,21\]_bias.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 84 42 42 --input1_shape 84 84 1 1 --output0_shape 128 84 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/yuqxia/repo/nnfusion/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_\[128\,84\,42\,42\]_\[84\,84\,1\,1\]_\[128\,84\,42\,42\].cu --json_file roller_nas/roller_Convolution_\[128\,84\,42\,42\]_\[84\,84\,1\,1\]_\[128\,84\,42\,42\]_bias.json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 168 42 42 --input1_shape 168 168 1 1 --output0_shape 128 168 42 42 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/yuqxia/repo/nnfusion/src/tools/nnfusion/kernel_db/nas_conv/roller_Convolution_\[128\,168\,42\,42\]_\[168\,168\,1\,1\]_\[128\,168\,42\,42\].cu --json_file roller_nas/roller_Convolution_\[128\,168\,42\,42\]_\[168\,168\,1\,1\]_\[128\,168\,42\,42\]_bias.json diff --git a/src/tools/nnfusion/kernel_db/parse_res.sh b/src/tools/nnfusion/kernel_db/parse_res.sh new file mode 100755 index 000000000..432fb7e48 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/parse_res.sh @@ -0,0 +1,26 @@ +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 3 230 230 --input1_shape 64 3 7 7 --output0_shape 128 64 112 112 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,3\,230\,230\]_\[64\,3\,7\,7\]_\[128\,64\,112\,112\].cu --json_file roller_res/roller_Convolution_\[128\,3\,230\,230\]_\[64\,3\,7\,7\]_\[128\,64\,112\,112\].json +python parse_code.py --op_type Fused_Convolution_Add 
--input0_shape 128 64 56 56 --input1_shape 256 64 1 1 --output0_shape 128 256 56 56 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,64\,56\,56\]_\[256\,64\,1\,1\]_\[128\,256\,56\,56\].cu --json_file roller_res/roller_Convolution_\[128\,64\,56\,56\]_\[256\,64\,1\,1\]_\[128\,256\,56\,56\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 64 56 56 --input1_shape 64 64 1 1 --output0_shape 128 64 56 56 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,64\,56\,56\]_\[64\,64\,1\,1\]_\[128\,64\,56\,56\].cu --json_file roller_res/roller_Convolution_\[128\,64\,56\,56\]_\[64\,64\,1\,1\]_\[128\,64\,56\,56\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 64 56 56 --input1_shape 64 64 3 3 --output0_shape 128 64 56 56 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,64\,56\,56\]_\[64\,64\,3\,3\]_\[128\,64\,56\,56\].cu --json_file roller_res/roller_Convolution_\[128\,64\,56\,56\]_\[64\,64\,3\,3\]_\[128\,64\,56\,56\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 256 56 56 --input1_shape 64 256 1 1 --output0_shape 128 64 56 56 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,256\,56\,56\]_\[64\,256\,1\,1\]_\[128\,64\,56\,56\].cu --json_file roller_res/roller_Convolution_\[128\,256\,56\,56\]_\[64\,256\,1\,1\]_\[128\,64\,56\,56\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 256 56 56 --input1_shape 512 256 1 1 --output0_shape 128 512 28 28 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,256\,56\,56\]_\[512\,256\,1\,1\]_\[128\,512\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,256\,56\,56\]_\[512\,256\,1\,1\]_\[128\,512\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 256 56 56 --input1_shape 128 256 1 1 --output0_shape 128 128 56 56 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,256\,56\,56\]_\[128\,256\,1\,1\]_\[128\,128\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,256\,56\,56\]_\[128\,256\,1\,1\]_\[128\,128\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 128 58 58 --input1_shape 128 128 3 3 --output0_shape 128 128 28 28 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,128\,58\,58\]_\[128\,128\,3\,3\]_\[128\,128\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,128\,58\,58\]_\[128\,128\,3\,3\]_\[128\,128\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 128 28 28 --input1_shape 512 128 1 1 --output0_shape 128 512 28 28 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,128\,28\,28\]_\[512\,128\,1\,1\]_\[128\,512\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,128\,28\,28\]_\[512\,128\,1\,1\]_\[128\,512\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 512 28 28 --input1_shape 128 512 1 1 --output0_shape 128 128 28 28 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,512\,28\,28\]_\[128\,512\,1\,1\]_\[128\,128\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,512\,28\,28\]_\[128\,512\,1\,1\]_\[128\,128\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 128 28 28 --input1_shape 128 128 3 3 --output0_shape 128 128 28 28 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,128\,28\,28\]_\[128\,128\,3\,3\]_\[128\,128\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,128\,28\,28\]_\[128\,128\,3\,3\]_\[128\,128\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 512 28 28 --input1_shape 1024 512 1 1 --output0_shape 128 1024 14 14 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,512\,28\,28\]_\[1024\,512\,1\,1\]_\[128\,1024\,14\,14\].cu --json_file roller_res/roller_Convolution_\[128\,512\,28\,28\]_\[1024\,512\,1\,1\]_\[128\,1024\,14\,14\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 512 28 28 --input1_shape 256 512 1 1 --output0_shape 128 256 28 28 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,512\,28\,28\]_\[256\,512\,1\,1\]_\[128\,256\,28\,28\].cu --json_file roller_res/roller_Convolution_\[128\,512\,28\,28\]_\[256\,512\,1\,1\]_\[128\,256\,28\,28\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 256 30 30 --input1_shape 256 256 3 3 --output0_shape 128 256 14 14 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,256\,30\,30\]_\[256\,256\,3\,3\]_\[128\,256\,14\,14\].cu --json_file roller_res/roller_Convolution_\[128\,256\,30\,30\]_\[256\,256\,3\,3\]_\[128\,256\,14\,14\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 256 14 14 --input1_shape 1024 256 1 1 --output0_shape 128 1024 14 14 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,256\,14\,14\]_\[1024\,256\,1\,1\]_\[128\,1024\,14\,14\].cu --json_file roller_res/roller_Convolution_\[128\,256\,14\,14\]_\[1024\,256\,1\,1\]_\[128\,1024\,14\,14\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 1024 14 14 --input1_shape 256 1024 1 1 --output0_shape 128 256 14 14 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,1024\,14\,14\]_\[256\,1024\,1\,1\]_\[128\,256\,14\,14\].cu --json_file roller_res/roller_Convolution_\[128\,1024\,14\,14\]_\[256\,1024\,1\,1\]_\[128\,256\,14\,14\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 256 14 14 --input1_shape 256 256 3 3 --output0_shape 128 256 14 14 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,256\,14\,14\]_\[256\,256\,3\,3\]_\[128\,256\,14\,14\].cu --json_file roller_res/roller_Convolution_\[128\,256\,14\,14\]_\[256\,256\,3\,3\]_\[128\,256\,14\,14\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 1024 14 14 --input1_shape 2048 1024 1 1 --output0_shape 128 2048 7 7 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,1024\,14\,14\]_\[2048\,1024\,1\,1\]_\[128\,2048\,7\,7\].cu --json_file roller_res/roller_Convolution_\[128\,1024\,14\,14\]_\[2048\,1024\,1\,1\]_\[128\,2048\,7\,7\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 1024 14 14 --input1_shape 512 1024 1 1 --output0_shape 128 512 14 14 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,1024\,14\,14\]_\[512\,1024\,1\,1\]_\[128\,512\,14\,14\].cu --json_file roller_res/roller_Convolution_\[128\,1024\,14\,14\]_\[512\,1024\,1\,1\]_\[128\,512\,14\,14\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 512 16 16 --input1_shape 512 512 3 3 --output0_shape 128 512 7 7 --stride 2 2 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,512\,16\,16\]_\[512\,512\,3\,3\]_\[128\,512\,7\,7\].cu --json_file roller_res/roller_Convolution_\[128\,512\,16\,16\]_\[512\,512\,3\,3\]_\[128\,512\,7\,7\].json +python parse_code.py --op_type Fused_Convolution_Add --input0_shape 128 512 7 7 --input1_shape 2048 512 1 1 --output0_shape 128 2048 7 7 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,512\,7\,7\]_\[2048\,512\,1\,1\]_\[128\,2048\,7\,7\].cu --json_file roller_res/roller_Convolution_\[128\,512\,7\,7\]_\[2048\,512\,1\,1\]_\[128\,2048\,7\,7\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 2048 7 7 --input1_shape 512 2048 1 1 --output0_shape 128 512 7 7 --stride 1 1 --padding 0 0 --dilation 1 1 --source_file 
/home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,2048\,7\,7\]_\[512\,2048\,1\,1\]_\[128\,512\,7\,7\].cu --json_file roller_res/roller_Convolution_\[128\,2048\,7\,7\]_\[512\,2048\,1\,1\]_\[128\,512\,7\,7\].json +python parse_code.py --op_type Fused_Convolution_Add_Relu --input0_shape 128 512 7 7 --input1_shape 512 512 3 3 --output0_shape 128 512 7 7 --stride 1 1 --padding 1 1 --dilation 1 1 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Convolution_\[128\,512\,7\,7\]_\[512\,512\,3\,3\]_\[128\,512\,7\,7\].cu --json_file roller_res/roller_Convolution_\[128\,512\,7\,7\]_\[512\,512\,3\,3\]_\[128\,512\,7\,7\].json +python parse_code.py --op_type Dot --input0_shape 128 2048 --input1_shape 2048 1000 --output0_shape 128 1000 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Dot_\[128\,2048\]_\[2048\,1000\]_\[128\,1000\].cu --json_file roller_res/roller_Dot_\[128\,2048\]_\[2048\,1000\]_\[128\,1000\].json +python parse_code.py --op_type MaxPool --input0_shape 128 64 112 112 --output0_shape 128 64 56 56 --window_shape 3 3 --stride 2 2 --padding 0 0 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_MaxPool_\[128\,64\,112\,112\]_\[128\,64\,56\,56\].cu --json_file roller_res/roller_MaxPool_\[128\,64\,112\,112\]_\[128\,64\,56\,56\].json +python parse_code.py --op_type Sum --input0_shape 128 2048 7 7 --output0_shape 128 2048 --reduction_axis 2 3 --source_file /home/v-zhuho/new_repo/TiledCompiler/tiled-compiler/e2e_kernels/res_bs128/roller_Sum_\[128\,2048\,7\,7\]_\[128\,2048\].cu --json_file roller_res/roller_Sum_\[128\,2048\,7\,7\]_\[128\,2048\].json \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/parser.out b/src/tools/nnfusion/kernel_db/parser.out new file mode 100644 index 000000000..1591c1173 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/parser.out @@ -0,0 +1,317 @@ 
+Created by PLY version 3.11 (http://www.dabeaz.com/ply) + +Unused terminals: + + CHAR + COMMENT1 + COMMENT2 + DPOUND + ELSE + FLOAT + FOR + IF + POUND + STRING + SYNC + WS + +Grammar + +Rule 0 S' -> start +Rule 1 start -> signature +Rule 2 start -> shared +Rule 3 signature -> GLOBAL VOID ID ( parameters ) +Rule 4 parameters -> parameter +Rule 5 parameters -> parameters , parameter +Rule 6 parameter -> type ID +Rule 7 parameter -> type QUALIFIER ID +Rule 8 parameter -> QUALIFIER type ID +Rule 9 type -> TYPE +Rule 10 type -> TYPE * +Rule 11 shared -> SHARED TYPE ID [ INTEGER ] ; + +Terminals, with rules where they appear + +( : 3 +) : 3 +* : 10 +, : 5 +; : 11 +CHAR : +COMMENT1 : +COMMENT2 : +DPOUND : +ELSE : +FLOAT : +FOR : +GLOBAL : 3 +ID : 3 6 7 8 11 +IF : +INTEGER : 11 +POUND : +QUALIFIER : 7 8 +SHARED : 11 +STRING : +SYNC : +TYPE : 9 10 11 +VOID : 3 +WS : +[ : 11 +] : 11 +error : + +Nonterminals, with rules where they appear + +parameter : 4 5 +parameters : 3 5 +shared : 2 +signature : 1 +start : 0 +type : 6 7 8 + +Parsing method: LALR + +state 0 + + (0) S' -> . start + (1) start -> . signature + (2) start -> . shared + (3) signature -> . GLOBAL VOID ID ( parameters ) + (11) shared -> . SHARED TYPE ID [ INTEGER ] ; + + GLOBAL shift and go to state 4 + SHARED shift and go to state 5 + + start shift and go to state 1 + signature shift and go to state 2 + shared shift and go to state 3 + +state 1 + + (0) S' -> start . + + + +state 2 + + (1) start -> signature . + + $end reduce using rule 1 (start -> signature .) + + +state 3 + + (2) start -> shared . + + $end reduce using rule 2 (start -> shared .) + + +state 4 + + (3) signature -> GLOBAL . VOID ID ( parameters ) + + VOID shift and go to state 6 + + +state 5 + + (11) shared -> SHARED . TYPE ID [ INTEGER ] ; + + TYPE shift and go to state 7 + + +state 6 + + (3) signature -> GLOBAL VOID . ID ( parameters ) + + ID shift and go to state 8 + + +state 7 + + (11) shared -> SHARED TYPE . 
ID [ INTEGER ] ; + + ID shift and go to state 9 + + +state 8 + + (3) signature -> GLOBAL VOID ID . ( parameters ) + + ( shift and go to state 10 + + +state 9 + + (11) shared -> SHARED TYPE ID . [ INTEGER ] ; + + [ shift and go to state 11 + + +state 10 + + (3) signature -> GLOBAL VOID ID ( . parameters ) + (4) parameters -> . parameter + (5) parameters -> . parameters , parameter + (6) parameter -> . type ID + (7) parameter -> . type QUALIFIER ID + (8) parameter -> . QUALIFIER type ID + (9) type -> . TYPE + (10) type -> . TYPE * + + QUALIFIER shift and go to state 15 + TYPE shift and go to state 16 + + parameters shift and go to state 12 + parameter shift and go to state 13 + type shift and go to state 14 + +state 11 + + (11) shared -> SHARED TYPE ID [ . INTEGER ] ; + + INTEGER shift and go to state 17 + + +state 12 + + (3) signature -> GLOBAL VOID ID ( parameters . ) + (5) parameters -> parameters . , parameter + + ) shift and go to state 18 + , shift and go to state 19 + + +state 13 + + (4) parameters -> parameter . + + ) reduce using rule 4 (parameters -> parameter .) + , reduce using rule 4 (parameters -> parameter .) + + +state 14 + + (6) parameter -> type . ID + (7) parameter -> type . QUALIFIER ID + + ID shift and go to state 20 + QUALIFIER shift and go to state 21 + + +state 15 + + (8) parameter -> QUALIFIER . type ID + (9) type -> . TYPE + (10) type -> . TYPE * + + TYPE shift and go to state 16 + + type shift and go to state 22 + +state 16 + + (9) type -> TYPE . + (10) type -> TYPE . * + + ID reduce using rule 9 (type -> TYPE .) + QUALIFIER reduce using rule 9 (type -> TYPE .) + * shift and go to state 23 + + +state 17 + + (11) shared -> SHARED TYPE ID [ INTEGER . ] ; + + ] shift and go to state 24 + + +state 18 + + (3) signature -> GLOBAL VOID ID ( parameters ) . + + $end reduce using rule 3 (signature -> GLOBAL VOID ID ( parameters ) .) + + +state 19 + + (5) parameters -> parameters , . parameter + (6) parameter -> . type ID + (7) parameter -> . 
type QUALIFIER ID + (8) parameter -> . QUALIFIER type ID + (9) type -> . TYPE + (10) type -> . TYPE * + + QUALIFIER shift and go to state 15 + TYPE shift and go to state 16 + + parameter shift and go to state 25 + type shift and go to state 14 + +state 20 + + (6) parameter -> type ID . + + ) reduce using rule 6 (parameter -> type ID .) + , reduce using rule 6 (parameter -> type ID .) + + +state 21 + + (7) parameter -> type QUALIFIER . ID + + ID shift and go to state 26 + + +state 22 + + (8) parameter -> QUALIFIER type . ID + + ID shift and go to state 27 + + +state 23 + + (10) type -> TYPE * . + + ID reduce using rule 10 (type -> TYPE * .) + QUALIFIER reduce using rule 10 (type -> TYPE * .) + + +state 24 + + (11) shared -> SHARED TYPE ID [ INTEGER ] . ; + + ; shift and go to state 28 + + +state 25 + + (5) parameters -> parameters , parameter . + + ) reduce using rule 5 (parameters -> parameters , parameter .) + , reduce using rule 5 (parameters -> parameters , parameter .) + + +state 26 + + (7) parameter -> type QUALIFIER ID . + + ) reduce using rule 7 (parameter -> type QUALIFIER ID .) + , reduce using rule 7 (parameter -> type QUALIFIER ID .) + + +state 27 + + (8) parameter -> QUALIFIER type ID . + + ) reduce using rule 8 (parameter -> QUALIFIER type ID .) + , reduce using rule 8 (parameter -> QUALIFIER type ID .) + + +state 28 + + (11) shared -> SHARED TYPE ID [ INTEGER ] ; . + + $end reduce using rule 11 (shared -> SHARED TYPE ID [ INTEGER ] ; .) + diff --git a/src/tools/nnfusion/kernel_db/parsetab.py b/src/tools/nnfusion/kernel_db/parsetab.py new file mode 100644 index 000000000..9931d37e3 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/parsetab.py @@ -0,0 +1,41 @@ + +# parsetab.py +# This file is automatically generated. Do not edit. 
+# pylint: disable=W,C,R +_tabversion = '3.10' + +_lr_method = 'LALR' + +_lr_signature = "CHAR COMMENT1 COMMENT2 DPOUND ELSE FLOAT FOR GLOBAL ID IF INTEGER POUND QUALIFIER SHARED STRING SYNC TYPE VOID WSstart : signature \n | shared\n signature : GLOBAL VOID ID '(' parameters ')'parameters : parameter\n | parameters ',' parameter\n parameter : type ID\n | type QUALIFIER ID\n | QUALIFIER type ID\n type : TYPE \n | TYPE '*'\n shared : SHARED TYPE ID '[' INTEGER ']' ';' " + +_lr_action_items = {'GLOBAL':([0,],[4,]),'SHARED':([0,],[5,]),'$end':([1,2,3,18,28,],[0,-1,-2,-3,-11,]),'VOID':([4,],[6,]),'TYPE':([5,10,15,19,],[7,16,16,16,]),'ID':([6,7,14,16,21,22,23,],[8,9,20,-9,26,27,-10,]),'(':([8,],[10,]),'[':([9,],[11,]),'QUALIFIER':([10,14,16,19,23,],[15,21,-9,15,-10,]),'INTEGER':([11,],[17,]),')':([12,13,20,25,26,27,],[18,-4,-6,-5,-7,-8,]),',':([12,13,20,25,26,27,],[19,-4,-6,-5,-7,-8,]),'*':([16,],[23,]),']':([17,],[24,]),';':([24,],[28,]),} + +_lr_action = {} +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + if not _x in _lr_action: _lr_action[_x] = {} + _lr_action[_x][_k] = _y +del _lr_action_items + +_lr_goto_items = {'start':([0,],[1,]),'signature':([0,],[2,]),'shared':([0,],[3,]),'parameters':([10,],[12,]),'parameter':([10,19,],[13,25,]),'type':([10,15,19,],[14,22,14,]),} + +_lr_goto = {} +for _k, _v in _lr_goto_items.items(): + for _x, _y in zip(_v[0], _v[1]): + if not _x in _lr_goto: _lr_goto[_x] = {} + _lr_goto[_x][_k] = _y +del _lr_goto_items +_lr_productions = [ + ("S' -> start","S'",1,None,None,None), + ('start -> signature','start',1,'p_start','cuparse.py',129), + ('start -> shared','start',1,'p_start','cuparse.py',130), + ('signature -> GLOBAL VOID ID ( parameters )','signature',6,'p_signature','cuparse.py',135), + ('parameters -> parameter','parameters',1,'p_parameters','cuparse.py',140), + ('parameters -> parameters , parameter','parameters',3,'p_parameters','cuparse.py',141), + ('parameter -> type 
ID','parameter',2,'p_parameter','cuparse.py',146), + ('parameter -> type QUALIFIER ID','parameter',3,'p_parameter','cuparse.py',147), + ('parameter -> QUALIFIER type ID','parameter',3,'p_parameter','cuparse.py',148), + ('type -> TYPE','type',1,'p_type','cuparse.py',158), + ('type -> TYPE *','type',2,'p_type','cuparse.py',159), + ('shared -> SHARED TYPE ID [ INTEGER ] ;','shared',7,'p_shared','cuparse.py',166), +] diff --git a/src/tools/nnfusion/kernel_db/roller_bert/convert.sh b/src/tools/nnfusion/kernel_db/roller_bert/convert.sh new file mode 100755 index 000000000..e1147d5b4 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/convert.sh @@ -0,0 +1,11 @@ +# python ../convert_external.py roller_BatchMatMul_[128,16,512,512]_[128,16,512,64]_[128,16,512,64].json +# python ../convert_external.py roller_BatchMatMul_[128,16,512,64]_[128,16,512,64]_[128,16,512,512].json +# # python ../convert_external.py roller_Broadcast_[128,512]_[128,512,512].json +# # python ../convert_external.py roller_Broadcast_[128,512,512]_[128,16,512,512].json +# python ../convert_external.py roller_Dot_[65536,1024]_[1024,1024]_[65536,1024].json +# python ../convert_external.py roller_Dot_[65536,1024]_[1024,4096]_[65536,1024].json +# python ../convert_external.py roller_Dot_[65536,2]_[2,1024]_[65536,1024].json +# python ../convert_external.py roller_Dot_[65536,30522]_[30522,1024]_[65536,1024].json +# python ../convert_external.py roller_Dot_[65536,4096]_[4096,1024]_[65536,1024].json +python ../convert_external.py roller_Sum_[128,512,1024]_[128,512].json +python ../convert_external.py roller_Sum_[65536,1024]_[65536].json \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/convert_ansor.sh b/src/tools/nnfusion/kernel_db/roller_bert/convert_ansor.sh new file mode 100755 index 000000000..32b59f703 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/convert_ansor.sh @@ -0,0 +1,7 @@ +python ../convert_external.py 
/home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_batch_matmul_128_16_512_512_64.json +python ../convert_external.py /home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_batch_matmul_128_16_512_64_512.json +python ../convert_external.py /home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_matmul_65536_1024_1024.json +python ../convert_external.py /home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_matmul_65536_1024_4096.json +python ../convert_external.py /home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_matmul_65536_2_1024.json +python ../convert_external.py /home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_matmul_65536_30522_1024.json +python ../convert_external.py /home/shanbinke/TiledCompiler/tiled-compiler/microbenchmark/tvm/e2e/ansor/ansor_matmul_65536_4096_1024.json diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,512]_[128,16,512,64]_[128,16,512,64].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,512]_[128,16,512,64]_[128,16,512,64].json new file mode 100644 index 000000000..bd44621fd --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,512]_[128,16,512,64]_[128,16,512,64].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [128, 16, 512, 512], "arg1_shape": [128, 16, 512, 64], "out_shape": [128, 16, 512, 64], "transpose_A": false, "transpose_B": false}, "op_type": "BatchMatMul", "tvm_func_name": "roller_BatchMatMul__128_16_512_512___128_16_512_64___128_16_512_64_", "code": "extern \"C\" __global__ void roller_BatchMatMul__128_16_512_512___128_16_512_64___128_16_512_64_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[32];\n __shared__ float A_shared[4224];\n __shared__ float B_shared[2048];\n float A_shared_local[4];\n float 
B_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 16; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[(((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n A_shared[(((((((int)threadIdx.x) >> 
5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2376))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2904))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + 
(((int)threadIdx.x) & 31)) + 3168))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3432))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3960))] = A[((((((((int)blockIdx.x) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n B_shared[(((int)threadIdx.x))] = B[(((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)))];\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 256))];\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 512))];\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 768))];\n B_shared[((((int)threadIdx.x) + 1024))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 1024))];\n B_shared[((((int)threadIdx.x) + 1280))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 1280))];\n B_shared[((((int)threadIdx.x) + 1536))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 1536))];\n B_shared[((((int)threadIdx.x) + 1792))] = B[((((((((int)blockIdx.x) >> 2) * 32768) + (k_outer * 2048)) + ((int)threadIdx.x)) + 1792))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n 
A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 3) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 3) * 33) + k_inner_outer) + 1056))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 3) * 33) + k_inner_outer) + 2112))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 3) * 33) + k_inner_outer) + 3168))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 7)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 8))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 16))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 24))];\n B_shared_local[(4)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 32))];\n B_shared_local[(5)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 40))];\n B_shared_local[(6)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 48))];\n B_shared_local[(7)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 56))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(10)] = (compute_local[(10)] + 
(A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(0)] * B_shared_local[(4)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(1)] * B_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(2)] * B_shared_local[(4)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(3)] * B_shared_local[(4)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(0)] * B_shared_local[(5)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(1)] * B_shared_local[(5)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(2)] * B_shared_local[(5)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(3)] * B_shared_local[(5)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(0)] * B_shared_local[(6)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(1)] * B_shared_local[(6)]));\n compute_local[(22)] = (compute_local[(22)] + (A_shared_local[(2)] * B_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(3)] * B_shared_local[(6)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(0)] * B_shared_local[(7)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(1)] * B_shared_local[(7)]));\n compute_local[(23)] = (compute_local[(23)] + (A_shared_local[(2)] * B_shared_local[(7)]));\n 
compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(3)] * B_shared_local[(7)]));\n }\n }\n compute[((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)))] = compute_local[(0)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2048))] = compute_local[(8)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4096))] = compute_local[(16)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6144))] = compute_local[(24)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 8))] = compute_local[(1)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2056))] = compute_local[(9)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4104))] = compute_local[(17)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6152))] = compute_local[(25)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 16))] = compute_local[(2)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2064))] = compute_local[(10)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4112))] = compute_local[(18)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6160))] = compute_local[(26)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 24))] = compute_local[(3)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2072))] = 
compute_local[(11)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4120))] = compute_local[(19)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6168))] = compute_local[(27)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 32))] = compute_local[(4)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2080))] = compute_local[(12)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4128))] = compute_local[(20)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6176))] = compute_local[(28)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 40))] = compute_local[(5)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2088))] = compute_local[(13)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4136))] = compute_local[(21)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6184))] = compute_local[(29)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 48))] = compute_local[(6)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2096))] = compute_local[(14)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4144))] = compute_local[(22)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6192))] = compute_local[(30)];\n compute[(((((((int)blockIdx.x) * 8192) + 
((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 56))] = compute_local[(7)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 2104))] = compute_local[(15)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 4152))] = compute_local[(23)];\n compute[(((((((int)blockIdx.x) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (((int)threadIdx.x) & 7)) + 6200))] = compute_local[(31)];\n}\n", "gridDim": [8192, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,64]_[128,16,512,64]_[128,16,512,512].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,64]_[128,16,512,64]_[128,16,512,512].json new file mode 100644 index 000000000..e4aec5924 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_BatchMatMul_[128,16,512,64]_[128,16,512,64]_[128,16,512,512].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [128, 16, 512, 64], "arg1_shape": [128, 16, 64, 512], "out_shape": [128, 16, 512, 512], "transpose_A": false, "transpose_B": false}, "op_type": "BatchMatMul", "tvm_func_name": "roller_BatchMatMul__128_16_512_64___128_16_512_64___128_16_512_512_", "code": "extern \"C\" __global__ void roller_BatchMatMul__128_16_512_64___128_16_512_64___128_16_512_512_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float A_shared[4224];\n __shared__ float B_shared[1024];\n float A_shared_local[8];\n float B_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n 
compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n 
compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 3) * 33) + (((int)threadIdx.x) & 7)))] = A[((((((((int)blockIdx.x) >> 2) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (k_outer * 8)) + (((int)threadIdx.x) & 7)))];\n A_shared[(((((((int)threadIdx.x) >> 3) * 33) + (((int)threadIdx.x) & 7)) + 1056))] = A[(((((((((int)blockIdx.x) >> 2) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (k_outer * 8)) + (((int)threadIdx.x) & 7)) + 2048))];\n A_shared[(((((((int)threadIdx.x) >> 3) * 33) + (((int)threadIdx.x) & 7)) + 2112))] = A[(((((((((int)blockIdx.x) >> 2) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (k_outer * 8)) + (((int)threadIdx.x) & 7)) + 4096))];\n A_shared[(((((((int)threadIdx.x) >> 3) * 33) + (((int)threadIdx.x) & 7)) + 3168))] = A[(((((((((int)blockIdx.x) >> 2) * 8192) + ((((int)threadIdx.x) >> 3) * 64)) + (k_outer * 8)) + (((int)threadIdx.x) & 7)) + 6144))];\n B_shared[(((int)threadIdx.x))] = B[(((((((((int)blockIdx.x) >> 4) * 32768) + (k_outer * 4096)) + ((((int)threadIdx.x) >> 7) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 127)))];\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((((((int)blockIdx.x) >> 4) * 32768) + (k_outer * 4096)) + ((((int)threadIdx.x) >> 7) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 127)) + 1024))];\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((((((int)blockIdx.x) >> 4) * 32768) + (k_outer * 4096)) + ((((int)threadIdx.x) >> 7) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 127)) + 2048))];\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((((((int)blockIdx.x) >> 4) * 32768) + (k_outer * 4096)) + ((((int)threadIdx.x) >> 7) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 127)) + 3072))];\n __syncthreads();\n for (int k_inner_outer = 0; 
k_inner_outer < 8; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 528))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1056))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1584))];\n A_shared_local[(4)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2112))];\n A_shared_local[(5)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2640))];\n A_shared_local[(6)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3168))];\n A_shared_local[(7)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3696))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n B_shared_local[(4)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n B_shared_local[(5)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n B_shared_local[(6)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n B_shared_local[(7)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(32)] = (compute_local[(32)] + 
(A_shared_local[(4)] * B_shared_local[(0)]));\n compute_local[(40)] = (compute_local[(40)] + (A_shared_local[(5)] * B_shared_local[(0)]));\n compute_local[(48)] = (compute_local[(48)] + (A_shared_local[(6)] * B_shared_local[(0)]));\n compute_local[(56)] = (compute_local[(56)] + (A_shared_local[(7)] * B_shared_local[(0)]));\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(33)] = (compute_local[(33)] + (A_shared_local[(4)] * B_shared_local[(1)]));\n compute_local[(41)] = (compute_local[(41)] + (A_shared_local[(5)] * B_shared_local[(1)]));\n compute_local[(49)] = (compute_local[(49)] + (A_shared_local[(6)] * B_shared_local[(1)]));\n compute_local[(57)] = (compute_local[(57)] + (A_shared_local[(7)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(34)] = (compute_local[(34)] + (A_shared_local[(4)] * B_shared_local[(2)]));\n compute_local[(42)] = (compute_local[(42)] + (A_shared_local[(5)] * B_shared_local[(2)]));\n compute_local[(50)] = (compute_local[(50)] + (A_shared_local[(6)] * B_shared_local[(2)]));\n compute_local[(58)] = (compute_local[(58)] + (A_shared_local[(7)] * B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n 
compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (A_shared_local[(4)] * B_shared_local[(3)]));\n compute_local[(43)] = (compute_local[(43)] + (A_shared_local[(5)] * B_shared_local[(3)]));\n compute_local[(51)] = (compute_local[(51)] + (A_shared_local[(6)] * B_shared_local[(3)]));\n compute_local[(59)] = (compute_local[(59)] + (A_shared_local[(7)] * B_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(0)] * B_shared_local[(4)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(1)] * B_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(2)] * B_shared_local[(4)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(3)] * B_shared_local[(4)]));\n compute_local[(36)] = (compute_local[(36)] + (A_shared_local[(4)] * B_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (A_shared_local[(5)] * B_shared_local[(4)]));\n compute_local[(52)] = (compute_local[(52)] + (A_shared_local[(6)] * B_shared_local[(4)]));\n compute_local[(60)] = (compute_local[(60)] + (A_shared_local[(7)] * B_shared_local[(4)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(0)] * B_shared_local[(5)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(1)] * B_shared_local[(5)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(2)] * B_shared_local[(5)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(3)] * B_shared_local[(5)]));\n compute_local[(37)] = (compute_local[(37)] + (A_shared_local[(4)] * B_shared_local[(5)]));\n compute_local[(45)] = (compute_local[(45)] + (A_shared_local[(5)] * B_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (A_shared_local[(6)] * B_shared_local[(5)]));\n compute_local[(61)] = (compute_local[(61)] + 
(A_shared_local[(7)] * B_shared_local[(5)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(0)] * B_shared_local[(6)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(1)] * B_shared_local[(6)]));\n compute_local[(22)] = (compute_local[(22)] + (A_shared_local[(2)] * B_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(3)] * B_shared_local[(6)]));\n compute_local[(38)] = (compute_local[(38)] + (A_shared_local[(4)] * B_shared_local[(6)]));\n compute_local[(46)] = (compute_local[(46)] + (A_shared_local[(5)] * B_shared_local[(6)]));\n compute_local[(54)] = (compute_local[(54)] + (A_shared_local[(6)] * B_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (A_shared_local[(7)] * B_shared_local[(6)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(0)] * B_shared_local[(7)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(1)] * B_shared_local[(7)]));\n compute_local[(23)] = (compute_local[(23)] + (A_shared_local[(2)] * B_shared_local[(7)]));\n compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(3)] * B_shared_local[(7)]));\n compute_local[(39)] = (compute_local[(39)] + (A_shared_local[(4)] * B_shared_local[(7)]));\n compute_local[(47)] = (compute_local[(47)] + (A_shared_local[(5)] * B_shared_local[(7)]));\n compute_local[(55)] = (compute_local[(55)] + (A_shared_local[(6)] * B_shared_local[(7)]));\n compute_local[(63)] = (compute_local[(63)] + (A_shared_local[(7)] * B_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8192))] = compute_local[(8)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + 
((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16384))] = compute_local[(16)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24576))] = compute_local[(24)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32768))] = compute_local[(32)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 40960))] = compute_local[(40)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49152))] = compute_local[(48)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57344))] = compute_local[(56)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8208))] = compute_local[(9)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16400))] = compute_local[(17)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24592))] = compute_local[(25)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32784))] = compute_local[(33)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + 
((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 40976))] = compute_local[(41)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49168))] = compute_local[(49)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57360))] = compute_local[(57)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = compute_local[(2)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8224))] = compute_local[(10)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16416))] = compute_local[(18)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24608))] = compute_local[(26)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32800))] = compute_local[(34)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 40992))] = compute_local[(42)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49184))] = compute_local[(50)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57376))] = compute_local[(58)];\n 
compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = compute_local[(3)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8240))] = compute_local[(11)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16432))] = compute_local[(19)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24624))] = compute_local[(27)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32816))] = compute_local[(35)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 41008))] = compute_local[(43)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49200))] = compute_local[(51)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57392))] = compute_local[(59)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = compute_local[(4)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8256))] = compute_local[(12)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16448))] 
= compute_local[(20)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24640))] = compute_local[(28)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32832))] = compute_local[(36)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 41024))] = compute_local[(44)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49216))] = compute_local[(52)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57408))] = compute_local[(60)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = compute_local[(5)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8272))] = compute_local[(13)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16464))] = compute_local[(21)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24656))] = compute_local[(29)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32848))] = compute_local[(37)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + 
(((int)threadIdx.x) & 15)) + 41040))] = compute_local[(45)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49232))] = compute_local[(53)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57424))] = compute_local[(61)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = compute_local[(6)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8288))] = compute_local[(14)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16480))] = compute_local[(22)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24672))] = compute_local[(30)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32864))] = compute_local[(38)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 41056))] = compute_local[(46)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49248))] = compute_local[(54)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57440))] = compute_local[(62)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + 
((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = compute_local[(7)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 8304))] = compute_local[(15)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 16496))] = compute_local[(23)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 24688))] = compute_local[(31)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 32880))] = compute_local[(39)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 41072))] = compute_local[(47)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 49264))] = compute_local[(55)];\n compute[(((((((((int)blockIdx.x) >> 2) * 65536) + ((((int)threadIdx.x) >> 4) * 512)) + ((((int)blockIdx.x) & 3) * 128)) + (((int)threadIdx.x) & 15)) + 57456))] = compute_local[(63)];\n}\n", "gridDim": [32768, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512,512]_[128,16,512,512].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512,512]_[128,16,512,512].json new file mode 100644 index 000000000..e9264cd59 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512,512]_[128,16,512,512].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 512], "output_shape": [128, 16, 512, 512], "broadcast_axis": [1]}, "op_type": "Broadcast", 
"tvm_func_name": "roller_Broadcast__128_512_512___128_16_512_512_", "code": "extern \"C\" __global__ void roller_Broadcast__128_512_512___128_16_512_512_(float* __restrict__ compute, float* __restrict__ A) {\n compute[(((((int)blockIdx.x) * 512) + ((int)threadIdx.x)))] = A[(((((((int)blockIdx.x) >> 13) * 262144) + ((((int)blockIdx.x) & 511) * 512)) + ((int)threadIdx.x)))];\n}\n", "gridDim": [1048576, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512]_[128,512,512].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512]_[128,512,512].json new file mode 100644 index 000000000..e547032e6 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Broadcast_[128,512]_[128,512,512].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512], "output_shape": [128, 512, 512], "broadcast_axis": [1]}, "op_type": "Broadcast", "tvm_func_name": "roller_Broadcast__128_512___128_512_512_", "code": "extern \"C\" __global__ void roller_Broadcast__128_512___128_512_512_(float* __restrict__ compute, float* __restrict__ A) {\n compute[((((((((((int)blockIdx.x) >> 12) * 2097152) + ((((int)threadIdx.x) >> 6) * 262144)) + (((((int)blockIdx.x) & 4095) >> 5) * 2048)) + (((((int)threadIdx.x) & 63) >> 4) * 512)) + ((((int)blockIdx.x) & 31) * 16)) + (((int)threadIdx.x) & 15)))] = A[((((((((int)blockIdx.x) >> 12) * 4096) + ((((int)threadIdx.x) >> 6) * 512)) + ((((int)blockIdx.x) & 31) * 16)) + (((int)threadIdx.x) & 15)))];\n}\n", "gridDim": [65536, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,1024]_[65536,1024].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,1024]_[65536,1024].json new file mode 100644 index 000000000..65b5afe61 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,1024]_[65536,1024].json @@ -0,0 
+1 @@ +{"parameters": {"arg0_shape": [65536, 1024], "arg1_shape": [1024, 1024], "out_shape": [65536, 1024], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__65536_1024___1024_1024___65536_1024_", "code": "extern \"C\" __global__ void roller_Dot__65536_1024___1024_1024___65536_1024_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float A_shared[4224];\n __shared__ float B_shared[4096];\n float A_shared_local[8];\n float B_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n 
compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 
24576))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2376))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2904))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 90112))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3168))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 
1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3432))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 106496))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3960))] = A[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122880))];\n B_shared[(((int)threadIdx.x))] = B[(((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)))];\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 2048))];\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 4096))];\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 6144))];\n B_shared[((((int)threadIdx.x) + 1024))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 8192))];\n B_shared[((((int)threadIdx.x) + 1280))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 10240))];\n B_shared[((((int)threadIdx.x) + 1536))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 
127)) + 12288))];\n B_shared[((((int)threadIdx.x) + 1792))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 14336))];\n B_shared[((((int)threadIdx.x) + 2048))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 16384))];\n B_shared[((((int)threadIdx.x) + 2304))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 18432))];\n B_shared[((((int)threadIdx.x) + 2560))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 20480))];\n B_shared[((((int)threadIdx.x) + 2816))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 22528))];\n B_shared[((((int)threadIdx.x) + 3072))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 24576))];\n B_shared[((((int)threadIdx.x) + 3328))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 26624))];\n B_shared[((((int)threadIdx.x) + 3584))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 28672))];\n B_shared[((((int)threadIdx.x) + 3840))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 30720))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 528))];\n A_shared_local[(2)] = 
A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1056))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1584))];\n A_shared_local[(4)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2112))];\n A_shared_local[(5)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2640))];\n A_shared_local[(6)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3168))];\n A_shared_local[(7)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3696))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n B_shared_local[(4)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n B_shared_local[(5)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n B_shared_local[(6)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n B_shared_local[(7)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(32)] = (compute_local[(32)] + (A_shared_local[(4)] * B_shared_local[(0)]));\n compute_local[(40)] = (compute_local[(40)] + (A_shared_local[(5)] * B_shared_local[(0)]));\n compute_local[(48)] = (compute_local[(48)] + (A_shared_local[(6)] * B_shared_local[(0)]));\n compute_local[(56)] = 
(compute_local[(56)] + (A_shared_local[(7)] * B_shared_local[(0)]));\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(33)] = (compute_local[(33)] + (A_shared_local[(4)] * B_shared_local[(1)]));\n compute_local[(41)] = (compute_local[(41)] + (A_shared_local[(5)] * B_shared_local[(1)]));\n compute_local[(49)] = (compute_local[(49)] + (A_shared_local[(6)] * B_shared_local[(1)]));\n compute_local[(57)] = (compute_local[(57)] + (A_shared_local[(7)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(34)] = (compute_local[(34)] + (A_shared_local[(4)] * B_shared_local[(2)]));\n compute_local[(42)] = (compute_local[(42)] + (A_shared_local[(5)] * B_shared_local[(2)]));\n compute_local[(50)] = (compute_local[(50)] + (A_shared_local[(6)] * B_shared_local[(2)]));\n compute_local[(58)] = (compute_local[(58)] + (A_shared_local[(7)] * B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (A_shared_local[(4)] * 
B_shared_local[(3)]));\n compute_local[(43)] = (compute_local[(43)] + (A_shared_local[(5)] * B_shared_local[(3)]));\n compute_local[(51)] = (compute_local[(51)] + (A_shared_local[(6)] * B_shared_local[(3)]));\n compute_local[(59)] = (compute_local[(59)] + (A_shared_local[(7)] * B_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(0)] * B_shared_local[(4)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(1)] * B_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(2)] * B_shared_local[(4)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(3)] * B_shared_local[(4)]));\n compute_local[(36)] = (compute_local[(36)] + (A_shared_local[(4)] * B_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (A_shared_local[(5)] * B_shared_local[(4)]));\n compute_local[(52)] = (compute_local[(52)] + (A_shared_local[(6)] * B_shared_local[(4)]));\n compute_local[(60)] = (compute_local[(60)] + (A_shared_local[(7)] * B_shared_local[(4)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(0)] * B_shared_local[(5)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(1)] * B_shared_local[(5)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(2)] * B_shared_local[(5)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(3)] * B_shared_local[(5)]));\n compute_local[(37)] = (compute_local[(37)] + (A_shared_local[(4)] * B_shared_local[(5)]));\n compute_local[(45)] = (compute_local[(45)] + (A_shared_local[(5)] * B_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (A_shared_local[(6)] * B_shared_local[(5)]));\n compute_local[(61)] = (compute_local[(61)] + (A_shared_local[(7)] * B_shared_local[(5)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(0)] * B_shared_local[(6)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(1)] * B_shared_local[(6)]));\n compute_local[(22)] = 
(compute_local[(22)] + (A_shared_local[(2)] * B_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(3)] * B_shared_local[(6)]));\n compute_local[(38)] = (compute_local[(38)] + (A_shared_local[(4)] * B_shared_local[(6)]));\n compute_local[(46)] = (compute_local[(46)] + (A_shared_local[(5)] * B_shared_local[(6)]));\n compute_local[(54)] = (compute_local[(54)] + (A_shared_local[(6)] * B_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (A_shared_local[(7)] * B_shared_local[(6)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(0)] * B_shared_local[(7)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(1)] * B_shared_local[(7)]));\n compute_local[(23)] = (compute_local[(23)] + (A_shared_local[(2)] * B_shared_local[(7)]));\n compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(3)] * B_shared_local[(7)]));\n compute_local[(39)] = (compute_local[(39)] + (A_shared_local[(4)] * B_shared_local[(7)]));\n compute_local[(47)] = (compute_local[(47)] + (A_shared_local[(5)] * B_shared_local[(7)]));\n compute_local[(55)] = (compute_local[(55)] + (A_shared_local[(6)] * B_shared_local[(7)]));\n compute_local[(63)] = (compute_local[(63)] + (A_shared_local[(7)] * B_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16384))] = compute_local[(8)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32768))] = compute_local[(16)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49152))] = 
compute_local[(24)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65536))] = compute_local[(32)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81920))] = compute_local[(40)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98304))] = compute_local[(48)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114688))] = compute_local[(56)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16400))] = compute_local[(9)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32784))] = compute_local[(17)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49168))] = compute_local[(25)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65552))] = compute_local[(33)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81936))] = compute_local[(41)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 
7) * 128)) + (((int)threadIdx.x) & 15)) + 98320))] = compute_local[(49)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114704))] = compute_local[(57)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = compute_local[(2)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16416))] = compute_local[(10)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32800))] = compute_local[(18)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49184))] = compute_local[(26)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65568))] = compute_local[(34)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81952))] = compute_local[(42)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98336))] = compute_local[(50)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114720))] = compute_local[(58)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = compute_local[(3)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + 
((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16432))] = compute_local[(11)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32816))] = compute_local[(19)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49200))] = compute_local[(27)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65584))] = compute_local[(35)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81968))] = compute_local[(43)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98352))] = compute_local[(51)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114736))] = compute_local[(59)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = compute_local[(4)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16448))] = compute_local[(12)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32832))] = compute_local[(20)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49216))] = 
compute_local[(28)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65600))] = compute_local[(36)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81984))] = compute_local[(44)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98368))] = compute_local[(52)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114752))] = compute_local[(60)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = compute_local[(5)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16464))] = compute_local[(13)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32848))] = compute_local[(21)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49232))] = compute_local[(29)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65616))] = compute_local[(37)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 82000))] = compute_local[(45)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) 
& 7) * 128)) + (((int)threadIdx.x) & 15)) + 98384))] = compute_local[(53)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114768))] = compute_local[(61)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = compute_local[(6)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16480))] = compute_local[(14)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32864))] = compute_local[(22)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49248))] = compute_local[(30)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65632))] = compute_local[(38)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 82016))] = compute_local[(46)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98400))] = compute_local[(54)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114784))] = compute_local[(62)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = compute_local[(7)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + 
((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16496))] = compute_local[(15)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32880))] = compute_local[(23)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49264))] = compute_local[(31)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65648))] = compute_local[(39)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 82032))] = compute_local[(47)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98416))] = compute_local[(55)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114800))] = compute_local[(63)];\n}\n", "gridDim": [4096, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,4096]_[65536,1024].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,4096]_[65536,1024].json new file mode 100644 index 000000000..f72359eec --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,1024]_[1024,4096]_[65536,1024].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [65536, 1024], "arg1_shape": [1024, 4096], "out_shape": [65536, 4096], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__65536_1024___1024_4096___65536_4096_", "code": "extern \"C\" __global__ void 
roller_Dot__65536_1024___1024_4096___65536_4096_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float A_shared[4224];\n __shared__ float B_shared[4096];\n float A_shared_local[8];\n float B_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n 
compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) 
>> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2376))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2904))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 90112))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3168))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3432))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 
31)) + 106496))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3960))] = A[(((((((((int)blockIdx.x) >> 5) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122880))];\n B_shared[(((int)threadIdx.x))] = B[(((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)))];\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 8192))];\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 16384))];\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 24576))];\n B_shared[((((int)threadIdx.x) + 1024))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 32768))];\n B_shared[((((int)threadIdx.x) + 1280))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 40960))];\n B_shared[((((int)threadIdx.x) + 1536))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 49152))];\n B_shared[((((int)threadIdx.x) + 1792))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 57344))];\n B_shared[((((int)threadIdx.x) + 2048))] = 
B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 65536))];\n B_shared[((((int)threadIdx.x) + 2304))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 73728))];\n B_shared[((((int)threadIdx.x) + 2560))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 81920))];\n B_shared[((((int)threadIdx.x) + 2816))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 90112))];\n B_shared[((((int)threadIdx.x) + 3072))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 98304))];\n B_shared[((((int)threadIdx.x) + 3328))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 106496))];\n B_shared[((((int)threadIdx.x) + 3584))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 114688))];\n B_shared[((((int)threadIdx.x) + 3840))] = B[((((((k_outer * 131072) + ((((int)threadIdx.x) >> 7) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 127)) + 122880))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 528))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1056))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1584))];\n A_shared_local[(4)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) 
+ 2112))];\n A_shared_local[(5)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2640))];\n A_shared_local[(6)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3168))];\n A_shared_local[(7)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3696))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n B_shared_local[(4)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n B_shared_local[(5)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n B_shared_local[(6)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n B_shared_local[(7)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(32)] = (compute_local[(32)] + (A_shared_local[(4)] * B_shared_local[(0)]));\n compute_local[(40)] = (compute_local[(40)] + (A_shared_local[(5)] * B_shared_local[(0)]));\n compute_local[(48)] = (compute_local[(48)] + (A_shared_local[(6)] * B_shared_local[(0)]));\n compute_local[(56)] = (compute_local[(56)] + (A_shared_local[(7)] * B_shared_local[(0)]));\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n 
compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(33)] = (compute_local[(33)] + (A_shared_local[(4)] * B_shared_local[(1)]));\n compute_local[(41)] = (compute_local[(41)] + (A_shared_local[(5)] * B_shared_local[(1)]));\n compute_local[(49)] = (compute_local[(49)] + (A_shared_local[(6)] * B_shared_local[(1)]));\n compute_local[(57)] = (compute_local[(57)] + (A_shared_local[(7)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(34)] = (compute_local[(34)] + (A_shared_local[(4)] * B_shared_local[(2)]));\n compute_local[(42)] = (compute_local[(42)] + (A_shared_local[(5)] * B_shared_local[(2)]));\n compute_local[(50)] = (compute_local[(50)] + (A_shared_local[(6)] * B_shared_local[(2)]));\n compute_local[(58)] = (compute_local[(58)] + (A_shared_local[(7)] * B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (A_shared_local[(4)] * B_shared_local[(3)]));\n compute_local[(43)] = (compute_local[(43)] + (A_shared_local[(5)] * B_shared_local[(3)]));\n compute_local[(51)] = (compute_local[(51)] + (A_shared_local[(6)] * B_shared_local[(3)]));\n compute_local[(59)] = (compute_local[(59)] + 
(A_shared_local[(7)] * B_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(0)] * B_shared_local[(4)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(1)] * B_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(2)] * B_shared_local[(4)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(3)] * B_shared_local[(4)]));\n compute_local[(36)] = (compute_local[(36)] + (A_shared_local[(4)] * B_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (A_shared_local[(5)] * B_shared_local[(4)]));\n compute_local[(52)] = (compute_local[(52)] + (A_shared_local[(6)] * B_shared_local[(4)]));\n compute_local[(60)] = (compute_local[(60)] + (A_shared_local[(7)] * B_shared_local[(4)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(0)] * B_shared_local[(5)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(1)] * B_shared_local[(5)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(2)] * B_shared_local[(5)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(3)] * B_shared_local[(5)]));\n compute_local[(37)] = (compute_local[(37)] + (A_shared_local[(4)] * B_shared_local[(5)]));\n compute_local[(45)] = (compute_local[(45)] + (A_shared_local[(5)] * B_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (A_shared_local[(6)] * B_shared_local[(5)]));\n compute_local[(61)] = (compute_local[(61)] + (A_shared_local[(7)] * B_shared_local[(5)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(0)] * B_shared_local[(6)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(1)] * B_shared_local[(6)]));\n compute_local[(22)] = (compute_local[(22)] + (A_shared_local[(2)] * B_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(3)] * B_shared_local[(6)]));\n compute_local[(38)] = (compute_local[(38)] + (A_shared_local[(4)] * B_shared_local[(6)]));\n 
compute_local[(46)] = (compute_local[(46)] + (A_shared_local[(5)] * B_shared_local[(6)]));\n compute_local[(54)] = (compute_local[(54)] + (A_shared_local[(6)] * B_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (A_shared_local[(7)] * B_shared_local[(6)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(0)] * B_shared_local[(7)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(1)] * B_shared_local[(7)]));\n compute_local[(23)] = (compute_local[(23)] + (A_shared_local[(2)] * B_shared_local[(7)]));\n compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(3)] * B_shared_local[(7)]));\n compute_local[(39)] = (compute_local[(39)] + (A_shared_local[(4)] * B_shared_local[(7)]));\n compute_local[(47)] = (compute_local[(47)] + (A_shared_local[(5)] * B_shared_local[(7)]));\n compute_local[(55)] = (compute_local[(55)] + (A_shared_local[(6)] * B_shared_local[(7)]));\n compute_local[(63)] = (compute_local[(63)] + (A_shared_local[(7)] * B_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65536))] = compute_local[(8)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131072))] = compute_local[(16)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196608))] = compute_local[(24)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262144))] = compute_local[(32)];\n compute[(((((((((int)blockIdx.x) >> 5) 
* 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327680))] = compute_local[(40)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393216))] = compute_local[(48)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 458752))] = compute_local[(56)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65552))] = compute_local[(9)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131088))] = compute_local[(17)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196624))] = compute_local[(25)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262160))] = compute_local[(33)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327696))] = compute_local[(41)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393232))] = compute_local[(49)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 
458768))] = compute_local[(57)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = compute_local[(2)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65568))] = compute_local[(10)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131104))] = compute_local[(18)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196640))] = compute_local[(26)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262176))] = compute_local[(34)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327712))] = compute_local[(42)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393248))] = compute_local[(50)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 458784))] = compute_local[(58)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = compute_local[(3)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65584))] = compute_local[(11)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) 
+ ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131120))] = compute_local[(19)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196656))] = compute_local[(27)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262192))] = compute_local[(35)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327728))] = compute_local[(43)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393264))] = compute_local[(51)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 458800))] = compute_local[(59)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = compute_local[(4)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65600))] = compute_local[(12)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131136))] = compute_local[(20)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196672))] = compute_local[(28)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262208))] = compute_local[(36)];\n 
compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327744))] = compute_local[(44)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393280))] = compute_local[(52)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 458816))] = compute_local[(60)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = compute_local[(5)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65616))] = compute_local[(13)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131152))] = compute_local[(21)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196688))] = compute_local[(29)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262224))] = compute_local[(37)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327760))] = compute_local[(45)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393296))] = compute_local[(53)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) 
* 128)) + (((int)threadIdx.x) & 15)) + 458832))] = compute_local[(61)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = compute_local[(6)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65632))] = compute_local[(14)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131168))] = compute_local[(22)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196704))] = compute_local[(30)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262240))] = compute_local[(38)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327776))] = compute_local[(46)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393312))] = compute_local[(54)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 458848))] = compute_local[(62)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = compute_local[(7)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 65648))] = compute_local[(15)];\n compute[(((((((((int)blockIdx.x) >> 5) * 
524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 131184))] = compute_local[(23)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 196720))] = compute_local[(31)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 262256))] = compute_local[(39)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 327792))] = compute_local[(47)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 393328))] = compute_local[(55)];\n compute[(((((((((int)blockIdx.x) >> 5) * 524288) + ((((int)threadIdx.x) >> 4) * 4096)) + ((((int)blockIdx.x) & 31) * 128)) + (((int)threadIdx.x) & 15)) + 458864))] = compute_local[(63)];\n}\n", "gridDim": [16384, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,2]_[2,1024]_[65536,1024].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,2]_[2,1024]_[65536,1024].json new file mode 100644 index 000000000..05949e667 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,2]_[2,1024]_[65536,1024].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [65536, 2], "arg1_shape": [2, 1024], "out_shape": [65536, 1024], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__65536_2___2_1024___65536_1024_", "code": "extern \"C\" __global__ void roller_Dot__65536_2___2_1024___65536_1024_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[32];\n __shared__ float A_shared[8448];\n __shared__ 
float B_shared[128];\n float A_shared_local[16];\n float B_shared_local[2];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n A_shared[((((((int)threadIdx.x) >> 1) * 33) + (((int)threadIdx.x) & 1)))] = A[((((((int)blockIdx.x) >> 4) * 512) + ((int)threadIdx.x)))];\n if (((int)threadIdx.x) < 128) {\n B_shared[(((int)threadIdx.x))] = B[(((((((int)threadIdx.x) >> 6) * 1024) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 63)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 2; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 528))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 
1056))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1584))];\n A_shared_local[(4)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 2112))];\n A_shared_local[(5)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 2640))];\n A_shared_local[(6)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 3168))];\n A_shared_local[(7)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 3696))];\n A_shared_local[(8)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 4224))];\n A_shared_local[(9)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 4752))];\n A_shared_local[(10)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 5280))];\n A_shared_local[(11)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 5808))];\n A_shared_local[(12)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 6336))];\n A_shared_local[(13)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 6864))];\n A_shared_local[(14)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 7392))];\n A_shared_local[(15)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 7920))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 31)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 31)) + 32))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(4)] * B_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(5)] * 
B_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(6)] * B_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(7)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(8)] * B_shared_local[(0)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(9)] * B_shared_local[(0)]));\n compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(10)] * B_shared_local[(0)]));\n compute_local[(22)] = (compute_local[(22)] + (A_shared_local[(11)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(12)] * B_shared_local[(0)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(13)] * B_shared_local[(0)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(14)] * B_shared_local[(0)]));\n compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(15)] * B_shared_local[(0)]));\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(4)] * B_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(5)] * B_shared_local[(1)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(6)] * B_shared_local[(1)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(7)] * B_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(8)] * B_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(9)] * B_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(10)] * B_shared_local[(1)]));\n compute_local[(23)] = 
(compute_local[(23)] + (A_shared_local[(11)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(12)] * B_shared_local[(1)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(13)] * B_shared_local[(1)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(14)] * B_shared_local[(1)]));\n compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(15)] * B_shared_local[(1)]));\n }\n compute[((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 16384))] = compute_local[(2)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 32768))] = compute_local[(4)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 49152))] = compute_local[(6)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 65536))] = compute_local[(8)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 81920))] = compute_local[(10)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 98304))] = compute_local[(12)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 114688))] = compute_local[(14)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + 
((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 131072))] = compute_local[(16)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 147456))] = compute_local[(18)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 163840))] = compute_local[(20)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 180224))] = compute_local[(22)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 196608))] = compute_local[(24)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 212992))] = compute_local[(26)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 229376))] = compute_local[(28)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 245760))] = compute_local[(30)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 32))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 16416))] = compute_local[(3)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 32800))] = 
compute_local[(5)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 49184))] = compute_local[(7)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 65568))] = compute_local[(9)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 81952))] = compute_local[(11)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 98336))] = compute_local[(13)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 114720))] = compute_local[(15)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 131104))] = compute_local[(17)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 147488))] = compute_local[(19)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 163872))] = compute_local[(21)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 180256))] = compute_local[(23)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 196640))] = compute_local[(25)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + 
((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 213024))] = compute_local[(27)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 229408))] = compute_local[(29)];\n compute[(((((((((int)blockIdx.x) >> 4) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 15) * 64)) + (((int)threadIdx.x) & 31)) + 245792))] = compute_local[(31)];\n}\n", "gridDim": [4096, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,30522]_[30522,1024]_[65536,1024].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,30522]_[30522,1024]_[65536,1024].json new file mode 100644 index 000000000..731941f30 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,30522]_[30522,1024]_[65536,1024].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [65536, 30522], "arg1_shape": [30522, 1024], "out_shape": [65536, 1024], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__65536_30522___30522_1024___65536_1024_", "code": "extern \"C\" __global__ void roller_Dot__65536_30522___30522_1024___65536_1024_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float A_shared[2112];\n __shared__ float B_shared[4096];\n float A_shared_local[16];\n float B_shared_local[4];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 
0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n 
for (int k_outer = 0; k_outer < 954; ++k_outer) {\n __syncthreads();\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 132))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122088))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 244176))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 396))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 366264))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 488352))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 660))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 610440))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = 
A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 732528))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 924))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 854616))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 976704))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1188))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1098792))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1220880))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1452))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1342968))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1465056))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n 
A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1716))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1587144))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1709232))];\n }\n if (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 30522) {\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1980))] = A[(((((((((int)blockIdx.x) >> 3) * 1953408) + ((((int)threadIdx.x) >> 5) * 30522)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1831320))];\n }\n B_shared[(((int)threadIdx.x))] = B[((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)))];\n B_shared[((((int)threadIdx.x) + 128))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 1024))];\n B_shared[((((int)threadIdx.x) + 256))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 2048))];\n B_shared[((((int)threadIdx.x) + 384))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 3072))];\n B_shared[((((int)threadIdx.x) + 512))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 4096))];\n B_shared[((((int)threadIdx.x) + 640))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 5120))];\n B_shared[((((int)threadIdx.x) + 768))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 6144))];\n B_shared[((((int)threadIdx.x) + 896))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 7168))];\n B_shared[((((int)threadIdx.x) + 1024))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + 
((int)threadIdx.x)) + 8192))];\n B_shared[((((int)threadIdx.x) + 1152))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 9216))];\n B_shared[((((int)threadIdx.x) + 1280))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 10240))];\n B_shared[((((int)threadIdx.x) + 1408))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 11264))];\n B_shared[((((int)threadIdx.x) + 1536))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 12288))];\n B_shared[((((int)threadIdx.x) + 1664))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 13312))];\n B_shared[((((int)threadIdx.x) + 1792))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 14336))];\n B_shared[((((int)threadIdx.x) + 1920))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 15360))];\n B_shared[((((int)threadIdx.x) + 2048))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 16384))];\n B_shared[((((int)threadIdx.x) + 2176))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 17408))];\n B_shared[((((int)threadIdx.x) + 2304))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 18432))];\n B_shared[((((int)threadIdx.x) + 2432))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 19456))];\n B_shared[((((int)threadIdx.x) + 2560))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 20480))];\n B_shared[((((int)threadIdx.x) + 2688))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 21504))];\n B_shared[((((int)threadIdx.x) + 2816))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 22528))];\n B_shared[((((int)threadIdx.x) + 2944))] = 
B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 23552))];\n B_shared[((((int)threadIdx.x) + 3072))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 24576))];\n B_shared[((((int)threadIdx.x) + 3200))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 25600))];\n if (k_outer < 953) {\n B_shared[((((int)threadIdx.x) + 3328))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 26624))];\n }\n if (k_outer < 953) {\n B_shared[((((int)threadIdx.x) + 3456))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 27648))];\n }\n if (k_outer < 953) {\n B_shared[((((int)threadIdx.x) + 3584))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 28672))];\n }\n if (k_outer < 953) {\n B_shared[((((int)threadIdx.x) + 3712))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 29696))];\n }\n if (k_outer < 953) {\n B_shared[((((int)threadIdx.x) + 3840))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 30720))];\n }\n if (k_outer < 953) {\n B_shared[((((int)threadIdx.x) + 3968))] = B[(((((k_outer * 32768) + ((((int)blockIdx.x) & 7) * 128)) + ((int)threadIdx.x)) + 31744))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n if (((k_outer * 32) + k_inner_outer) < 30522) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 132))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 264))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 396))];\n A_shared_local[(4)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 528))];\n A_shared_local[(5)] = 
A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 660))];\n A_shared_local[(6)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 792))];\n A_shared_local[(7)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 924))];\n A_shared_local[(8)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1056))];\n A_shared_local[(9)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1188))];\n A_shared_local[(10)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1320))];\n A_shared_local[(11)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1452))];\n A_shared_local[(12)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1584))];\n A_shared_local[(13)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1716))];\n A_shared_local[(14)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1848))];\n A_shared_local[(15)] = A_shared[(((((((int)threadIdx.x) >> 5) * 33) + k_inner_outer) + 1980))];\n }\n if (((k_outer * 32) + k_inner_outer) < 30522) {\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n }\n if (((k_outer * 32) + k_inner_outer) < 30522) {\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(4)] * B_shared_local[(0)]));\n 
compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(5)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(6)] * B_shared_local[(0)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(7)] * B_shared_local[(0)]));\n compute_local[(32)] = (compute_local[(32)] + (A_shared_local[(8)] * B_shared_local[(0)]));\n compute_local[(36)] = (compute_local[(36)] + (A_shared_local[(9)] * B_shared_local[(0)]));\n compute_local[(40)] = (compute_local[(40)] + (A_shared_local[(10)] * B_shared_local[(0)]));\n compute_local[(44)] = (compute_local[(44)] + (A_shared_local[(11)] * B_shared_local[(0)]));\n compute_local[(48)] = (compute_local[(48)] + (A_shared_local[(12)] * B_shared_local[(0)]));\n compute_local[(52)] = (compute_local[(52)] + (A_shared_local[(13)] * B_shared_local[(0)]));\n compute_local[(56)] = (compute_local[(56)] + (A_shared_local[(14)] * B_shared_local[(0)]));\n compute_local[(60)] = (compute_local[(60)] + (A_shared_local[(15)] * B_shared_local[(0)]));\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(4)] * B_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(5)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(6)] * B_shared_local[(1)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(7)] * B_shared_local[(1)]));\n compute_local[(33)] = (compute_local[(33)] + (A_shared_local[(8)] * B_shared_local[(1)]));\n compute_local[(37)] = (compute_local[(37)] + (A_shared_local[(9)] * B_shared_local[(1)]));\n compute_local[(41)] = (compute_local[(41)] + 
(A_shared_local[(10)] * B_shared_local[(1)]));\n compute_local[(45)] = (compute_local[(45)] + (A_shared_local[(11)] * B_shared_local[(1)]));\n compute_local[(49)] = (compute_local[(49)] + (A_shared_local[(12)] * B_shared_local[(1)]));\n compute_local[(53)] = (compute_local[(53)] + (A_shared_local[(13)] * B_shared_local[(1)]));\n compute_local[(57)] = (compute_local[(57)] + (A_shared_local[(14)] * B_shared_local[(1)]));\n compute_local[(61)] = (compute_local[(61)] + (A_shared_local[(15)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(4)] * B_shared_local[(2)]));\n compute_local[(22)] = (compute_local[(22)] + (A_shared_local[(5)] * B_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(6)] * B_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(7)] * B_shared_local[(2)]));\n compute_local[(34)] = (compute_local[(34)] + (A_shared_local[(8)] * B_shared_local[(2)]));\n compute_local[(38)] = (compute_local[(38)] + (A_shared_local[(9)] * B_shared_local[(2)]));\n compute_local[(42)] = (compute_local[(42)] + (A_shared_local[(10)] * B_shared_local[(2)]));\n compute_local[(46)] = (compute_local[(46)] + (A_shared_local[(11)] * B_shared_local[(2)]));\n compute_local[(50)] = (compute_local[(50)] + (A_shared_local[(12)] * B_shared_local[(2)]));\n compute_local[(54)] = (compute_local[(54)] + (A_shared_local[(13)] * B_shared_local[(2)]));\n compute_local[(58)] = (compute_local[(58)] + (A_shared_local[(14)] * B_shared_local[(2)]));\n compute_local[(62)] = (compute_local[(62)] + (A_shared_local[(15)] * 
B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(4)] * B_shared_local[(3)]));\n compute_local[(23)] = (compute_local[(23)] + (A_shared_local[(5)] * B_shared_local[(3)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(6)] * B_shared_local[(3)]));\n compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(7)] * B_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (A_shared_local[(8)] * B_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (A_shared_local[(9)] * B_shared_local[(3)]));\n compute_local[(43)] = (compute_local[(43)] + (A_shared_local[(10)] * B_shared_local[(3)]));\n compute_local[(47)] = (compute_local[(47)] + (A_shared_local[(11)] * B_shared_local[(3)]));\n compute_local[(51)] = (compute_local[(51)] + (A_shared_local[(12)] * B_shared_local[(3)]));\n compute_local[(55)] = (compute_local[(55)] + (A_shared_local[(13)] * B_shared_local[(3)]));\n compute_local[(59)] = (compute_local[(59)] + (A_shared_local[(14)] * B_shared_local[(3)]));\n compute_local[(63)] = (compute_local[(63)] + (A_shared_local[(15)] * B_shared_local[(3)]));\n }\n }\n }\n compute[((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 4096))] = compute_local[(4)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) 
* 128)) + (((int)threadIdx.x) & 31)) + 8192))] = compute_local[(8)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 12288))] = compute_local[(12)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 16384))] = compute_local[(16)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 20480))] = compute_local[(20)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 24576))] = compute_local[(24)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 28672))] = compute_local[(28)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 32768))] = compute_local[(32)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 36864))] = compute_local[(36)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 40960))] = compute_local[(40)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 45056))] = compute_local[(44)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 49152))] = compute_local[(48)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + 
((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 53248))] = compute_local[(52)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 57344))] = compute_local[(56)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 61440))] = compute_local[(60)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 4128))] = compute_local[(5)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 8224))] = compute_local[(9)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 12320))] = compute_local[(13)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 16416))] = compute_local[(17)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 20512))] = compute_local[(21)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 24608))] = compute_local[(25)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 28704))] = compute_local[(29)];\n 
compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 32800))] = compute_local[(33)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 36896))] = compute_local[(37)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 40992))] = compute_local[(41)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 45088))] = compute_local[(45)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 49184))] = compute_local[(49)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 53280))] = compute_local[(53)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 57376))] = compute_local[(57)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 61472))] = compute_local[(61)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = compute_local[(2)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 4160))] = compute_local[(6)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 
31)) + 8256))] = compute_local[(10)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 12352))] = compute_local[(14)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 16448))] = compute_local[(18)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 20544))] = compute_local[(22)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 24640))] = compute_local[(26)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 28736))] = compute_local[(30)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 32832))] = compute_local[(34)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 36928))] = compute_local[(38)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 41024))] = compute_local[(42)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 45120))] = compute_local[(46)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 49216))] = compute_local[(50)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + 
((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 53312))] = compute_local[(54)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 57408))] = compute_local[(58)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 61504))] = compute_local[(62)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = compute_local[(3)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 4192))] = compute_local[(7)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 8288))] = compute_local[(11)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 12384))] = compute_local[(15)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 16480))] = compute_local[(19)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 20576))] = compute_local[(23)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 24672))] = compute_local[(27)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 28768))] = compute_local[(31)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) 
+ ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 32864))] = compute_local[(35)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 36960))] = compute_local[(39)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 41056))] = compute_local[(43)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 45152))] = compute_local[(47)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 49248))] = compute_local[(51)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 53344))] = compute_local[(55)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 57440))] = compute_local[(59)];\n compute[(((((((((int)blockIdx.x) >> 3) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 31)) + 61536))] = compute_local[(63)];\n}\n", "gridDim": [8192, 1, 1], "blockDim": [128, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,4096]_[4096,1024]_[65536,1024].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,4096]_[4096,1024]_[65536,1024].json new file mode 100644 index 000000000..857df6939 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Dot_[65536,4096]_[4096,1024]_[65536,1024].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [65536, 4096], "arg1_shape": [4096, 1024], "out_shape": [65536, 
1024], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__65536_4096___4096_1024___65536_1024_", "code": "extern \"C\" __global__ void roller_Dot__65536_4096___4096_1024___65536_1024_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float A_shared[4224];\n __shared__ float B_shared[4096];\n float A_shared_local[8];\n float B_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 
0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 128; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = 
A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 131072))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 163840))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 196608))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 229376))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 262144))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2376))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 294912))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 327680))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2904))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 360448))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3168))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 393216))];\n 
A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3432))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 425984))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 458752))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3960))] = A[(((((((((int)blockIdx.x) >> 3) * 524288) + ((((int)threadIdx.x) >> 5) * 4096)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 491520))];\n B_shared[(((int)threadIdx.x))] = B[(((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)))];\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 2048))];\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 4096))];\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 6144))];\n B_shared[((((int)threadIdx.x) + 1024))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 8192))];\n B_shared[((((int)threadIdx.x) + 1280))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 10240))];\n B_shared[((((int)threadIdx.x) + 1536))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 12288))];\n B_shared[((((int)threadIdx.x) + 1792))] = 
B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 14336))];\n B_shared[((((int)threadIdx.x) + 2048))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 16384))];\n B_shared[((((int)threadIdx.x) + 2304))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 18432))];\n B_shared[((((int)threadIdx.x) + 2560))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 20480))];\n B_shared[((((int)threadIdx.x) + 2816))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 22528))];\n B_shared[((((int)threadIdx.x) + 3072))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 24576))];\n B_shared[((((int)threadIdx.x) + 3328))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 26624))];\n B_shared[((((int)threadIdx.x) + 3584))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 28672))];\n B_shared[((((int)threadIdx.x) + 3840))] = B[((((((k_outer * 32768) + ((((int)threadIdx.x) >> 7) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 127)) + 30720))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 528))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1056))];\n 
A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1584))];\n A_shared_local[(4)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2112))];\n A_shared_local[(5)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2640))];\n A_shared_local[(6)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3168))];\n A_shared_local[(7)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3696))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n B_shared_local[(4)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n B_shared_local[(5)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n B_shared_local[(6)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n B_shared_local[(7)] = B_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(32)] = (compute_local[(32)] + (A_shared_local[(4)] * B_shared_local[(0)]));\n compute_local[(40)] = (compute_local[(40)] + (A_shared_local[(5)] * B_shared_local[(0)]));\n compute_local[(48)] = (compute_local[(48)] + (A_shared_local[(6)] * B_shared_local[(0)]));\n compute_local[(56)] = (compute_local[(56)] + (A_shared_local[(7)] * B_shared_local[(0)]));\n 
compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(25)] = (compute_local[(25)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(33)] = (compute_local[(33)] + (A_shared_local[(4)] * B_shared_local[(1)]));\n compute_local[(41)] = (compute_local[(41)] + (A_shared_local[(5)] * B_shared_local[(1)]));\n compute_local[(49)] = (compute_local[(49)] + (A_shared_local[(6)] * B_shared_local[(1)]));\n compute_local[(57)] = (compute_local[(57)] + (A_shared_local[(7)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(18)] = (compute_local[(18)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(34)] = (compute_local[(34)] + (A_shared_local[(4)] * B_shared_local[(2)]));\n compute_local[(42)] = (compute_local[(42)] + (A_shared_local[(5)] * B_shared_local[(2)]));\n compute_local[(50)] = (compute_local[(50)] + (A_shared_local[(6)] * B_shared_local[(2)]));\n compute_local[(58)] = (compute_local[(58)] + (A_shared_local[(7)] * B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(27)] = (compute_local[(27)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (A_shared_local[(4)] * B_shared_local[(3)]));\n compute_local[(43)] = (compute_local[(43)] + 
(A_shared_local[(5)] * B_shared_local[(3)]));\n compute_local[(51)] = (compute_local[(51)] + (A_shared_local[(6)] * B_shared_local[(3)]));\n compute_local[(59)] = (compute_local[(59)] + (A_shared_local[(7)] * B_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(0)] * B_shared_local[(4)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(1)] * B_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (A_shared_local[(2)] * B_shared_local[(4)]));\n compute_local[(28)] = (compute_local[(28)] + (A_shared_local[(3)] * B_shared_local[(4)]));\n compute_local[(36)] = (compute_local[(36)] + (A_shared_local[(4)] * B_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (A_shared_local[(5)] * B_shared_local[(4)]));\n compute_local[(52)] = (compute_local[(52)] + (A_shared_local[(6)] * B_shared_local[(4)]));\n compute_local[(60)] = (compute_local[(60)] + (A_shared_local[(7)] * B_shared_local[(4)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(0)] * B_shared_local[(5)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(1)] * B_shared_local[(5)]));\n compute_local[(21)] = (compute_local[(21)] + (A_shared_local[(2)] * B_shared_local[(5)]));\n compute_local[(29)] = (compute_local[(29)] + (A_shared_local[(3)] * B_shared_local[(5)]));\n compute_local[(37)] = (compute_local[(37)] + (A_shared_local[(4)] * B_shared_local[(5)]));\n compute_local[(45)] = (compute_local[(45)] + (A_shared_local[(5)] * B_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (A_shared_local[(6)] * B_shared_local[(5)]));\n compute_local[(61)] = (compute_local[(61)] + (A_shared_local[(7)] * B_shared_local[(5)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(0)] * B_shared_local[(6)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(1)] * B_shared_local[(6)]));\n compute_local[(22)] = (compute_local[(22)] + (A_shared_local[(2)] * B_shared_local[(6)]));\n 
compute_local[(30)] = (compute_local[(30)] + (A_shared_local[(3)] * B_shared_local[(6)]));\n compute_local[(38)] = (compute_local[(38)] + (A_shared_local[(4)] * B_shared_local[(6)]));\n compute_local[(46)] = (compute_local[(46)] + (A_shared_local[(5)] * B_shared_local[(6)]));\n compute_local[(54)] = (compute_local[(54)] + (A_shared_local[(6)] * B_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (A_shared_local[(7)] * B_shared_local[(6)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(0)] * B_shared_local[(7)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(1)] * B_shared_local[(7)]));\n compute_local[(23)] = (compute_local[(23)] + (A_shared_local[(2)] * B_shared_local[(7)]));\n compute_local[(31)] = (compute_local[(31)] + (A_shared_local[(3)] * B_shared_local[(7)]));\n compute_local[(39)] = (compute_local[(39)] + (A_shared_local[(4)] * B_shared_local[(7)]));\n compute_local[(47)] = (compute_local[(47)] + (A_shared_local[(5)] * B_shared_local[(7)]));\n compute_local[(55)] = (compute_local[(55)] + (A_shared_local[(6)] * B_shared_local[(7)]));\n compute_local[(63)] = (compute_local[(63)] + (A_shared_local[(7)] * B_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16384))] = compute_local[(8)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32768))] = compute_local[(16)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49152))] = compute_local[(24)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) 
+ ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65536))] = compute_local[(32)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81920))] = compute_local[(40)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98304))] = compute_local[(48)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114688))] = compute_local[(56)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16400))] = compute_local[(9)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32784))] = compute_local[(17)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49168))] = compute_local[(25)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65552))] = compute_local[(33)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81936))] = compute_local[(41)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98320))] = 
compute_local[(49)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114704))] = compute_local[(57)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = compute_local[(2)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16416))] = compute_local[(10)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32800))] = compute_local[(18)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49184))] = compute_local[(26)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65568))] = compute_local[(34)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81952))] = compute_local[(42)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98336))] = compute_local[(50)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114720))] = compute_local[(58)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = compute_local[(3)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 
7) * 128)) + (((int)threadIdx.x) & 15)) + 16432))] = compute_local[(11)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32816))] = compute_local[(19)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49200))] = compute_local[(27)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65584))] = compute_local[(35)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81968))] = compute_local[(43)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98352))] = compute_local[(51)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114736))] = compute_local[(59)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = compute_local[(4)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16448))] = compute_local[(12)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32832))] = compute_local[(20)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49216))] = compute_local[(28)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + 
((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65600))] = compute_local[(36)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 81984))] = compute_local[(44)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98368))] = compute_local[(52)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114752))] = compute_local[(60)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = compute_local[(5)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16464))] = compute_local[(13)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32848))] = compute_local[(21)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49232))] = compute_local[(29)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65616))] = compute_local[(37)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 82000))] = compute_local[(45)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98384))] = 
compute_local[(53)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114768))] = compute_local[(61)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = compute_local[(6)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 16480))] = compute_local[(14)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32864))] = compute_local[(22)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49248))] = compute_local[(30)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65632))] = compute_local[(38)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 82016))] = compute_local[(46)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98400))] = compute_local[(54)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114784))] = compute_local[(62)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = compute_local[(7)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 
7) * 128)) + (((int)threadIdx.x) & 15)) + 16496))] = compute_local[(15)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 32880))] = compute_local[(23)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 49264))] = compute_local[(31)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 65648))] = compute_local[(39)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 82032))] = compute_local[(47)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 98416))] = compute_local[(55)];\n compute[(((((((((int)blockIdx.x) >> 3) * 131072) + ((((int)threadIdx.x) >> 4) * 1024)) + ((((int)blockIdx.x) & 7) * 128)) + (((int)threadIdx.x) & 15)) + 114800))] = compute_local[(63)];\n}\n", "gridDim": [4096, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[128,512,1024]_[128,512].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[128,512,1024]_[128,512].json new file mode 100644 index 000000000..996f6f2ea --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[128,512,1024]_[128,512].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 1024], "output_shape": [128, 512], "reduction_axis": [2]}, "op_type": "Sum", "tvm_func_name": "roller_Sum__128_512_1024___128_512_", "code": "extern \"C\" __global__ void roller_Sum__128_512_1024___128_512_(float* __restrict__ A, float* __restrict__ compute) {\n float compute_local[2];\n \n __shared__ float A_shared[8448];\n 
float A_shared_local[2];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[(((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 132))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 396))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 660))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 924))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1188))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1452))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1716))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1980))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2244))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 69632))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2376))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2508))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 77824))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2772))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 86016))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2904))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 90112))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3036))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 94208))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3168))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3300))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 102400))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3432))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 106496))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3564))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3828))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 118784))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3960))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122880))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4092))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 126976))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4224))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 131072))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4356))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 135168))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4488))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 139264))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4620))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 143360))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4752))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4884))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 151552))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5016))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 155648))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5148))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 159744))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5280))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 163840))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5412))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 167936))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5544))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 172032))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5676))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 176128))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5808))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 180224))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5940))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 184320))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6072))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 188416))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6204))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 192512))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6336))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 196608))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6468))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 200704))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6600))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 204800))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6732))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 208896))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6864))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 212992))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6996))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 217088))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7128))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 221184))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7260))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 225280))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7392))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 229376))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7524))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 233472))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7656))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 237568))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7788))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 241664))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7920))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 245760))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 8052))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 249856))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 8184))] = 
A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 253952))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 8316))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 258048))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[(((((int)threadIdx.x) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[((((((int)threadIdx.x) * 33) + k_inner_outer) + 4224))];\n compute_local[(0)] = (compute_local[(0)] + A_shared_local[(0)]);\n compute_local[(1)] = (compute_local[(1)] + A_shared_local[(1)]);\n }\n }\n compute[(((((int)blockIdx.x) * 256) + ((int)threadIdx.x)))] = compute_local[(0)];\n compute[((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) + 128))] = compute_local[(1)];\n}\n", "gridDim": [256, 1, 1], "blockDim": [128, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[65536,1024]_[65536].json b/src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[65536,1024]_[65536].json new file mode 100644 index 000000000..73657763c --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_bert/roller_Sum_[65536,1024]_[65536].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [65536, 1024], "output_shape": [65536], "reduction_axis": [1]}, "op_type": "Sum", "tvm_func_name": "roller_Sum__65536_1024___65536_", "code": "extern \"C\" __global__ void roller_Sum__65536_1024___65536_(float* __restrict__ A, float* __restrict__ compute) {\n float compute_local[2];\n __shared__ float A_shared[8448];\n float A_shared_local[2];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[(((((((int)blockIdx.x) * 262144) + 
((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 132))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 396))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 660))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 924))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1188))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 
1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1320))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1452))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1716))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1848))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1980))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2244))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 69632))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2376))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + 
(k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2508))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 77824))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2772))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 86016))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2904))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 90112))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3036))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 94208))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3168))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3300))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 102400))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3432))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 106496))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3564))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer 
* 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3828))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 118784))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3960))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122880))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4092))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 126976))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4224))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 131072))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4356))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 135168))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4488))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 139264))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4620))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 143360))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4752))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 
32)) + (((int)threadIdx.x) & 31)) + 147456))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 4884))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 151552))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5016))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 155648))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5148))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 159744))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5280))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 163840))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5412))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 167936))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5544))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 172032))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5676))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 176128))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5808))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 180224))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 5940))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 
32)) + (((int)threadIdx.x) & 31)) + 184320))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6072))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 188416))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6204))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 192512))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6336))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 196608))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6468))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 200704))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6600))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 204800))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6732))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 208896))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6864))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 212992))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 6996))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 217088))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7128))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 
32)) + (((int)threadIdx.x) & 31)) + 221184))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7260))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 225280))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7392))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 229376))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7524))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 233472))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7656))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 237568))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7788))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 241664))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 7920))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 245760))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 8052))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 249856))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 8184))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 253952))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 8316))] = A[((((((((int)blockIdx.x) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 
32)) + (((int)threadIdx.x) & 31)) + 258048))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[(((((int)threadIdx.x) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[((((((int)threadIdx.x) * 33) + k_inner_outer) + 4224))];\n compute_local[(0)] = (compute_local[(0)] + A_shared_local[(0)]);\n compute_local[(1)] = (compute_local[(1)] + A_shared_local[(1)]);\n }\n }\n compute[(((((int)blockIdx.x) * 256) + ((int)threadIdx.x)))] = compute_local[(0)];\n compute[((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) + 128))] = compute_local[(1)];\n}\n", "gridDim": [256, 1, 1], "blockDim": [128, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_lstm/roller_Dot_[128,256]_[256,256]_[256,256].json b/src/tools/nnfusion/kernel_db/roller_lstm/roller_Dot_[128,256]_[256,256]_[256,256].json new file mode 100644 index 000000000..cf4d048fb --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_lstm/roller_Dot_[128,256]_[256,256]_[256,256].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [128, 256], "arg1_shape": [256, 256], "out_shape": [128, 256], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__128_256___256_256___256_256_", "code": "extern \"C\" __global__ void roller_Dot__128_256___256_256___256_256_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[32];\n __shared__ float A_shared[4224];\n __shared__ float B_shared[2048];\n float A_shared_local[8];\n float B_shared_local[4];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 
0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[(((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1056))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 1584))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2112))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 2640))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 
20480))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3168))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 3696))] = A[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n B_shared[(((int)threadIdx.x))] = B[(((((k_outer * 8192) + ((((int)threadIdx.x) >> 6) * 256)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 63)))];\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((k_outer * 8192) + ((((int)threadIdx.x) >> 6) * 256)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 63)) + 2048))];\n B_shared[((((int)threadIdx.x) + 1024))] = B[((((((k_outer * 8192) + ((((int)threadIdx.x) >> 6) * 256)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 63)) + 4096))];\n B_shared[((((int)threadIdx.x) + 1536))] = B[((((((k_outer * 8192) + ((((int)threadIdx.x) >> 6) * 256)) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 63)) + 6144))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 1056))];\n A_shared_local[(2)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 2112))];\n A_shared_local[(3)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 3168))];\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n B_shared_local[(2)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n B_shared_local[(3)] = B_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n 
compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(0)] * B_shared_local[(2)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(0)] * B_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n compute_local[(6)] = (compute_local[(6)] + (A_shared_local[(1)] * B_shared_local[(2)]));\n compute_local[(7)] = (compute_local[(7)] + (A_shared_local[(1)] * B_shared_local[(3)]));\n compute_local[(8)] = (compute_local[(8)] + (A_shared_local[(2)] * B_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (A_shared_local[(2)] * B_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (A_shared_local[(2)] * B_shared_local[(2)]));\n compute_local[(11)] = (compute_local[(11)] + (A_shared_local[(2)] * B_shared_local[(3)]));\n compute_local[(12)] = (compute_local[(12)] + (A_shared_local[(3)] * B_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (A_shared_local[(3)] * B_shared_local[(1)]));\n compute_local[(14)] = (compute_local[(14)] + (A_shared_local[(3)] * B_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (A_shared_local[(3)] * B_shared_local[(3)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = compute_local[(2)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = compute_local[(3)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + 
(((int)threadIdx.x) & 15)) + 8192))] = compute_local[(4)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 8208))] = compute_local[(5)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 8224))] = compute_local[(6)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 8240))] = compute_local[(7)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16384))] = compute_local[(8)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16400))] = compute_local[(9)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16416))] = compute_local[(10)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16432))] = compute_local[(11)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 24576))] = compute_local[(12)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 24592))] = compute_local[(13)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 24608))] = compute_local[(14)];\n compute[((((((((int)threadIdx.x) >> 4) * 256) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 24624))] = compute_local[(15)];\n}\n", "gridDim": [4, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/convert.sh b/src/tools/nnfusion/kernel_db/roller_nas/convert.sh new file mode 100755 index 000000000..3f865371c --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/convert.sh @@ -0,0 +1,58 @@ +# python ../convert_external.py 
roller_AvgPool_[128,1008,42,42]_[128,1008,21,21].json +# python ../convert_external.py roller_AvgPool_[128,168,42,42]_[128,168,42,42].json +# python ../convert_external.py roller_AvgPool_[128,168,83,83]_[128,168,42,42].json +# python ../convert_external.py roller_AvgPool_[128,2016,21,21]_[128,2016,11,11].json +# python ../convert_external.py roller_AvgPool_[128,336,21,21]_[128,336,21,21].json +# python ../convert_external.py roller_AvgPool_[128,336,42,42]_[128,336,21,21].json +# python ../convert_external.py roller_AvgPool_[128,42,165,165]_[128,42,83,83].json +# python ../convert_external.py roller_AvgPool_[128,42,83,83]_[128,42,83,83].json +# python ../convert_external.py roller_AvgPool_[128,672,11,11]_[128,672,11,11].json +# python ../convert_external.py roller_AvgPool_[128,672,21,21]_[128,672,11,11].json +# python ../convert_external.py roller_AvgPool_[128,84,42,42]_[128,84,42,42].json +# python ../convert_external.py roller_AvgPool_[128,84,83,83]_[128,84,42,42].json +# python ../convert_external.py roller_AvgPool_[128,96,165,165]_[128,96,83,83].json +# python ../convert_external.py roller_Convolution_[128,1008,21,21]_[168,1008,1,1]_[128,168,21,21].json +# python ../convert_external.py roller_Convolution_[128,1008,42,42]_[168,1008,1,1]_[128,168,42,42].json +# python ../convert_external.py roller_Convolution_[128,1008,42,42]_[336,1008,1,1]_[128,336,42,42].json +# python ../convert_external.py roller_Convolution_[128,1344,21,21]_[336,1344,1,1]_[128,336,21,21].json +# python ../convert_external.py roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_relu.json +# python ../convert_external.py roller_Convolution_[128,168,42,42]_[84,168,1,1]_[128,84,42,42].json +# python ../convert_external.py roller_Convolution_[128,168,83,83]_[84,168,1,1]_[128,84,83,83].json +# python ../convert_external.py roller_Convolution_[128,2016,11,11]_[336,2016,1,1]_[128,336,11,11].json +# python ../convert_external.py 
roller_Convolution_[128,2016,21,21]_[336,2016,1,1]_[128,336,21,21].json +# python ../convert_external.py roller_Convolution_[128,2016,21,21]_[672,2016,1,1]_[128,672,21,21].json +# python ../convert_external.py roller_Convolution_[128,2688,11,11]_[672,2688,1,1]_[128,672,11,11].json +# python ../convert_external.py roller_Convolution_[128,3,331,331]_[96,3,3,3]_[128,96,165,165]_relu.json +# python ../convert_external.py roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_relu.json +# python ../convert_external.py roller_Convolution_[128,336,42,42]_[168,336,1,1]_[128,168,42,42].json +# python ../convert_external.py roller_Convolution_[128,4032,11,11]_[672,4032,1,1]_[128,672,11,11].json +# python ../convert_external.py roller_Convolution_[128,42,83,83]_[42,42,1,1]_[128,42,83,83].json +# python ../convert_external.py roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_relu.json +# python ../convert_external.py roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_relu.json +# python ../convert_external.py roller_Convolution_[128,96,165,165]_[42,96,1,1]_[128,42,165,165].json +# python ../convert_external.py roller_Convolution_[128,96,83,83]_[42,96,1,1]_[128,42,83,83].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,168,42,42]_[3,3,168,1]_[128,168,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,168,42,42]_[5,5,168,1]_[128,168,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,336,21,21]_[3,3,336,1]_[128,336,21,21].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,336,21,21]_[5,5,336,1]_[128,336,21,21].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,336,21,21]_[7,7,336,1]_[128,336,21,21].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,336,45,45]_[5,5,336,1]_[128,336,21,21].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,336,47,47]_[7,7,336,1]_[128,336,21,21].json +# 
python ../convert_external.py roller_DepthwiseConv2dNative_[128,42,165,165]_[5,5,42,1]_[128,42,83,83].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,42,83,83]_[3,3,42,1]_[128,42,83,83].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,42,83,83]_[5,5,42,1]_[128,42,83,83].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,42,83,83]_[7,7,42,1]_[128,42,83,83].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,672,11,11]_[3,3,672,1]_[128,672,11,11].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,672,11,11]_[5,5,672,1]_[128,672,11,11].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,672,11,11]_[7,7,672,1]_[128,672,11,11].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,672,21,21]_[5,5,672,1]_[128,672,11,11].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,672,21,21]_[7,7,672,1]_[128,672,11,11].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,84,42,42]_[3,3,84,1]_[128,84,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,84,42,42]_[5,5,84,1]_[128,84,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,84,42,42]_[7,7,84,1]_[128,84,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,84,83,83]_[5,5,84,1]_[128,84,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,84,83,83]_[7,7,84,1]_[128,84,42,42].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,96,165,165]_[5,5,96,1]_[128,96,83,83].json +# python ../convert_external.py roller_DepthwiseConv2dNative_[128,96,165,165]_[7,7,96,1]_[128,96,83,83].json +python ../convert_external.py roller_Dot_[128,4032]_[4032,1000]_[128,1000].json +# python ../convert_external.py roller_Sum_[128,4032,11,11]_[128,4032].json \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,1008,42,42]_[128,1008,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,1008,42,42]_[128,1008,21,21].json new file mode 100644 index 000000000..9765c7eaf --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,1008,42,42]_[128,1008,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1008, 42, 42], "output_shape": [128, 1008, 21, 21], "window_shape": [1, 1], "window_stride": [2, 2], "padding_below": [0, 0]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_1008_42_42___128_1008_21_21_", "code": "extern \"C\" __global__ void roller_AvgPool__128_1008_42_42___128_1008_21_21_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[1312];\n Pool2d[((((((((int)blockIdx.x) / 21) * 14112) + ((((int)threadIdx.x) / 12) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 12)))] = 0.000000e+00f;\n if ((((int)threadIdx.x) % 12) < 9) {\n Pool2d[(((((((((int)blockIdx.x) / 21) * 14112) + ((((int)threadIdx.x) / 12) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 12)) + 12))] = 0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = data[((((((((int)blockIdx.x) / 21) * 56448) + ((((int)threadIdx.x) / 41) * 1764)) + ((((int)blockIdx.x) % 21) * 84)) + (((int)threadIdx.x) % 41)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((int)blockIdx.x) / 21) * 56448) + (((((int)threadIdx.x) + 384) / 41) * 1764)) + ((((int)blockIdx.x) % 21) * 84)) + ((((int)threadIdx.x) + 15) % 41)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((int)blockIdx.x) / 21) * 56448) + (((((int)threadIdx.x) + 768) / 41) * 1764)) + ((((int)blockIdx.x) % 21) * 84)) + ((((int)threadIdx.x) + 30) % 41)))];\n if (((int)threadIdx.x) < 160) {\n compute_shared[((((int)threadIdx.x) + 1152))] = data[((((((((int)blockIdx.x) / 21) * 56448) + (((((int)threadIdx.x) + 1152) / 41) * 1764)) + 
((((int)blockIdx.x) % 21) * 84)) + ((((int)threadIdx.x) + 4) % 41)))];\n }\n __syncthreads();\n Pool2d[((((((((int)blockIdx.x) / 21) * 14112) + ((((int)threadIdx.x) / 12) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 12)))] = (Pool2d[((((((((int)blockIdx.x) / 21) * 14112) + ((((int)threadIdx.x) / 12) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 12)))] + compute_shared[((((((int)threadIdx.x) / 12) * 41) + ((((int)threadIdx.x) % 12) * 2)))]);\n if ((((int)threadIdx.x) % 12) < 9) {\n Pool2d[(((((((((int)blockIdx.x) / 21) * 14112) + ((((int)threadIdx.x) / 12) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 12)) + 12))] = (Pool2d[(((((((((int)blockIdx.x) / 21) * 14112) + ((((int)threadIdx.x) / 12) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 12)) + 12))] + compute_shared[(((((((int)threadIdx.x) / 12) * 41) + ((((int)threadIdx.x) % 12) * 2)) + 24))]);\n }\n}\n", "gridDim": [84672, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,42,42]_[128,168,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,42,42]_[128,168,42,42].json new file mode 100644 index 000000000..66d66ed9a --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,42,42]_[128,168,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "output_shape": [128, 168, 42, 42], "window_shape": [3, 3], "window_stride": [1, 1], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_168_42_42___128_168_42_42_", "code": "extern \"C\" __global__ void roller_AvgPool__128_168_42_42___128_168_42_42_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[352];\n if ((((int)threadIdx.x) % 48) < 42) {\n Pool2d[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = 
0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 44))) && (1 <= (((int)threadIdx.x) % 44))) && ((((int)threadIdx.x) % 44) < 43)) ? data[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 44) * 42)) + (((int)threadIdx.x) % 44)))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 64) {\n compute_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 7) * 6) + ((((int)threadIdx.x) + 288) / 44)) < 43) && (1 <= ((((int)threadIdx.x) + 24) % 44))) && (((((int)threadIdx.x) + 24) % 44) < 43)) ? data[((((((int)blockIdx.x) * 252) + (((((int)threadIdx.x) + 288) / 44) * 42)) + ((((int)threadIdx.x) + 24) % 44)))] : 0.000000e+00f);\n }\n \n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((int)threadIdx.x) % 48) < 42) {\n Pool2d[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = (Pool2d[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] + (compute_shared[((((((((int)threadIdx.x) / 48) * 44) + ((k_inner_outer / 3) * 44)) + (((int)threadIdx.x) % 48)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n}\n", "gridDim": [150528, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,83,83]_[128,168,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,83,83]_[128,168,42,42].json new file mode 100644 index 000000000..582200880 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,168,83,83]_[128,168,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 83, 83], "output_shape": [128, 168, 42, 42], "window_shape": [1, 1], "window_stride": [2, 2], "padding_below": [0, 0]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_168_83_83___128_168_42_42_", "code": "extern \"C\" __global__ void 
roller_AvgPool__128_168_83_83___128_168_42_42_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[664];\n Pool2d[((((((((int)blockIdx.x) / 42) * 14112) + ((((int)threadIdx.x) / 24) * 1764)) + ((((int)blockIdx.x) % 42) * 42)) + (((int)threadIdx.x) % 24)))] = 0.000000e+00f;\n if ((((int)threadIdx.x) % 24) < 18) {\n Pool2d[(((((((((int)blockIdx.x) / 42) * 14112) + ((((int)threadIdx.x) / 24) * 1764)) + ((((int)blockIdx.x) % 42) * 42)) + (((int)threadIdx.x) % 24)) + 24))] = 0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = data[((((((((int)blockIdx.x) / 42) * 55112) + ((((int)threadIdx.x) / 83) * 6889)) + ((((int)blockIdx.x) % 42) * 166)) + (((int)threadIdx.x) % 83)))];\n compute_shared[((((int)threadIdx.x) + 192))] = data[((((((((int)blockIdx.x) / 42) * 55112) + (((((int)threadIdx.x) + 192) / 83) * 6889)) + ((((int)blockIdx.x) % 42) * 166)) + ((((int)threadIdx.x) + 26) % 83)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((int)blockIdx.x) / 42) * 55112) + (((((int)threadIdx.x) + 384) / 83) * 6889)) + ((((int)blockIdx.x) % 42) * 166)) + ((((int)threadIdx.x) + 52) % 83)))];\n if (((int)threadIdx.x) < 88) {\n compute_shared[((((int)threadIdx.x) + 576))] = data[((((((((int)blockIdx.x) / 42) * 55112) + (((((int)threadIdx.x) + 576) / 83) * 6889)) + ((((int)blockIdx.x) % 42) * 166)) + ((((int)threadIdx.x) + 78) % 83)))];\n }\n __syncthreads();\n Pool2d[((((((((int)blockIdx.x) / 42) * 14112) + ((((int)threadIdx.x) / 24) * 1764)) + ((((int)blockIdx.x) % 42) * 42)) + (((int)threadIdx.x) % 24)))] = (Pool2d[((((((((int)blockIdx.x) / 42) * 14112) + ((((int)threadIdx.x) / 24) * 1764)) + ((((int)blockIdx.x) % 42) * 42)) + (((int)threadIdx.x) % 24)))] + compute_shared[((((((int)threadIdx.x) / 24) * 83) + ((((int)threadIdx.x) % 24) * 2)))]);\n if ((((int)threadIdx.x) % 24) < 18) {\n Pool2d[(((((((((int)blockIdx.x) / 42) * 14112) + ((((int)threadIdx.x) / 24) * 1764)) + ((((int)blockIdx.x) % 42) * 42)) + 
(((int)threadIdx.x) % 24)) + 24))] = (Pool2d[(((((((((int)blockIdx.x) / 42) * 14112) + ((((int)threadIdx.x) / 24) * 1764)) + ((((int)blockIdx.x) % 42) * 42)) + (((int)threadIdx.x) % 24)) + 24))] + compute_shared[(((((((int)threadIdx.x) / 24) * 83) + ((((int)threadIdx.x) % 24) * 2)) + 48))]);\n }\n}\n", "gridDim": [112896, 1, 1], "blockDim": [192, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,2016,21,21]_[128,2016,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,2016,21,21]_[128,2016,11,11].json new file mode 100644 index 000000000..a3dc9ae84 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,2016,21,21]_[128,2016,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2016, 21, 21], "output_shape": [128, 2016, 11, 11], "window_shape": [1, 1], "window_stride": [2, 2], "padding_below": [0, 0]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_2016_21_21___128_2016_11_11_", "code": "extern \"C\" __global__ void roller_AvgPool__128_2016_21_21___128_2016_11_11_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[882];\n if ((((int)threadIdx.x) & 15) < 11) {\n Pool2d[((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = 0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = data[(((((int)blockIdx.x) * 882) + ((int)threadIdx.x)))];\n compute_shared[((((int)threadIdx.x) + 352))] = data[((((((int)blockIdx.x) * 882) + ((int)threadIdx.x)) + 352))];\n if (((int)threadIdx.x) < 178) {\n compute_shared[((((int)threadIdx.x) + 704))] = data[((((((int)blockIdx.x) * 882) + ((int)threadIdx.x)) + 704))];\n }\n __syncthreads();\n if ((((int)threadIdx.x) & 15) < 11) {\n Pool2d[((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = (Pool2d[((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 
15)))] + compute_shared[(((((((int)threadIdx.x) / 176) * 441) + (((((int)threadIdx.x) % 176) >> 4) * 42)) + ((((int)threadIdx.x) & 15) * 2)))]);\n }\n}\n", "gridDim": [129024, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,21,21]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,21,21]_[128,336,21,21].json new file mode 100644 index 000000000..41603a1aa --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,21,21]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 21, 21], "output_shape": [128, 336, 21, 21], "window_shape": [3, 3], "window_stride": [1, 1], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_336_21_21___128_336_21_21_", "code": "extern \"C\" __global__ void roller_AvgPool__128_336_21_21___128_336_21_21_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[322];\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) < 21) {\n if ((((int)threadIdx.x) % 24) < 21) {\n Pool2d[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = 0.000000e+00f;\n }\n }\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23)) < 23) {\n compute_shared[(((int)threadIdx.x))] = (((((1 <= (((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23))) && ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23)) < 22)) && (1 <= (((int)threadIdx.x) % 23))) && ((((int)threadIdx.x) % 23) < 22)) ? 
data[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 23) * 21)) + (((int)threadIdx.x) % 23)))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 34) {\n if ((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 288) / 23)) < 23) {\n compute_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 288) / 23)) < 22) && (1 <= ((((int)threadIdx.x) + 12) % 23))) && (((((int)threadIdx.x) + 12) % 23) < 22)) ? data[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + (((((int)threadIdx.x) + 288) / 23) * 21)) + ((((int)threadIdx.x) + 12) % 23)))] : 0.000000e+00f);\n }\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) < 21) {\n if ((((int)threadIdx.x) % 24) < 21) {\n Pool2d[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = (Pool2d[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] + (compute_shared[((((((((int)threadIdx.x) / 24) * 23) + ((k_inner_outer / 3) * 23)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n }\n}\n", "gridDim": [86016, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,42,42]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,42,42]_[128,336,21,21].json new file mode 100644 index 000000000..3db8fb69f --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,336,42,42]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 42, 42], "output_shape": [128, 336, 21, 21], "window_shape": [3, 3], "window_stride": [2, 2], "padding_below": [0, 0]}, "op_type": "AvgPool", 
"tvm_func_name": "roller_AvgPool__128_336_42_42___128_336_21_21_", "code": "extern \"C\" __global__ void roller_AvgPool__128_336_42_42___128_336_21_21_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[1075];\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) < 21) {\n if ((((int)threadIdx.x) % 24) < 21) {\n Pool2d[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = 0.000000e+00f;\n }\n }\n compute_shared[(((int)threadIdx.x))] = (((1 <= (((((int)blockIdx.x) & 1) * 24) + (((int)threadIdx.x) / 43))) && (1 <= (((int)threadIdx.x) % 43))) ? data[((((((((int)blockIdx.x) >> 1) * 1764) + ((((int)blockIdx.x) & 1) * 1008)) + ((((int)threadIdx.x) / 43) * 42)) + (((int)threadIdx.x) % 43)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 288))] = ((1 <= ((((int)threadIdx.x) + 30) % 43)) ? data[((((((((int)blockIdx.x) >> 1) * 1764) + ((((int)blockIdx.x) & 1) * 1008)) + (((((int)threadIdx.x) + 288) / 43) * 42)) + ((((int)threadIdx.x) + 30) % 43)))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) & 1) * 24) + ((((int)threadIdx.x) + 576) / 43)) < 44) {\n compute_shared[((((int)threadIdx.x) + 576))] = ((((((((int)blockIdx.x) & 1) * 24) + ((((int)threadIdx.x) + 576) / 43)) < 43) && (1 <= ((((int)threadIdx.x) + 17) % 43))) ? data[((((((((int)blockIdx.x) >> 1) * 1764) + ((((int)blockIdx.x) & 1) * 1008)) + (((((int)threadIdx.x) + 576) / 43) * 42)) + ((((int)threadIdx.x) + 17) % 43)))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 211) {\n if ((((((int)blockIdx.x) & 1) * 24) + ((((int)threadIdx.x) + 864) / 43)) < 44) {\n compute_shared[((((int)threadIdx.x) + 864))] = ((((((((int)blockIdx.x) & 1) * 24) + ((((int)threadIdx.x) + 864) / 43)) < 43) && (1 <= ((((int)threadIdx.x) + 4) % 43))) ? 
data[((((((((int)blockIdx.x) >> 1) * 1764) + ((((int)blockIdx.x) & 1) * 1008)) + (((((int)threadIdx.x) + 864) / 43) * 42)) + ((((int)threadIdx.x) + 4) % 43)))] : 0.000000e+00f);\n }\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) < 21) {\n if ((((int)threadIdx.x) % 24) < 21) {\n Pool2d[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = (Pool2d[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] + (compute_shared[((((((((int)threadIdx.x) / 24) * 86) + ((k_inner_outer / 3) * 43)) + ((((int)threadIdx.x) % 24) * 2)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n }\n}\n", "gridDim": [86016, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,165,165]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,165,165]_[128,42,83,83].json new file mode 100644 index 000000000..1205623bb --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,165,165]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 165, 165], "output_shape": [128, 42, 83, 83], "window_shape": [3, 3], "window_stride": [2, 2], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_42_165_165___128_42_83_83_", "code": "extern \"C\" __global__ void roller_AvgPool__128_42_165_165___128_42_83_83_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[1503];\n if ((((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 96)) < 83) {\n if ((((int)threadIdx.x) % 96) < 83) {\n Pool2d[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 96) * 83)) + 
(((int)threadIdx.x) % 96)))] = 0.000000e+00f;\n }\n }\n compute_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 21) * 8) + (((int)threadIdx.x) / 167))) && (1 <= (((int)threadIdx.x) % 167))) && ((((int)threadIdx.x) % 167) < 166)) ? data[((((((((int)blockIdx.x) / 21) * 27225) + ((((int)blockIdx.x) % 21) * 1320)) + ((((int)threadIdx.x) / 167) * 165)) + (((int)threadIdx.x) % 167)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 384))] = (((1 <= ((((int)threadIdx.x) + 50) % 167)) && (((((int)threadIdx.x) + 50) % 167) < 166)) ? data[((((((((int)blockIdx.x) / 21) * 27225) + ((((int)blockIdx.x) % 21) * 1320)) + (((((int)threadIdx.x) + 384) / 167) * 165)) + ((((int)threadIdx.x) + 50) % 167)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 768))] = (((((((((int)blockIdx.x) % 21) * 8) + ((((int)threadIdx.x) + 768) / 167)) < 166) && (1 <= ((((int)threadIdx.x) + 100) % 167))) && (((((int)threadIdx.x) + 100) % 167) < 166)) ? data[((((((((int)blockIdx.x) / 21) * 27225) + ((((int)blockIdx.x) % 21) * 1320)) + (((((int)threadIdx.x) + 768) / 167) * 165)) + ((((int)threadIdx.x) + 100) % 167)))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 351) {\n if ((((((int)blockIdx.x) % 21) * 8) + ((((int)threadIdx.x) + 1152) / 167)) < 167) {\n compute_shared[((((int)threadIdx.x) + 1152))] = (((((((((int)blockIdx.x) % 21) * 8) + ((((int)threadIdx.x) + 1152) / 167)) < 166) && (1 <= ((((int)threadIdx.x) + 150) % 167))) && (((((int)threadIdx.x) + 150) % 167) < 166)) ? 
data[((((((((int)blockIdx.x) / 21) * 27225) + ((((int)blockIdx.x) % 21) * 1320)) + (((((int)threadIdx.x) + 1152) / 167) * 165)) + ((((int)threadIdx.x) + 150) % 167)))] : 0.000000e+00f);\n }\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 96)) < 83) {\n if ((((int)threadIdx.x) % 96) < 83) {\n Pool2d[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 96) * 83)) + (((int)threadIdx.x) % 96)))] = (Pool2d[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 96) * 83)) + (((int)threadIdx.x) % 96)))] + (compute_shared[((((((((int)threadIdx.x) / 96) * 334) + ((k_inner_outer / 3) * 167)) + ((((int)threadIdx.x) % 96) * 2)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n }\n}\n", "gridDim": [112896, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,83,83]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,83,83]_[128,42,83,83].json new file mode 100644 index 000000000..2e441b543 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,42,83,83]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 83, 83], "output_shape": [128, 42, 83, 83], "window_shape": [3, 3], "window_stride": [1, 1], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_42_83_83___128_42_83_83_", "code": "extern \"C\" __global__ void roller_AvgPool__128_42_83_83___128_42_83_83_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[510];\n if ((((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 88)) < 83) {\n if ((((int)threadIdx.x) % 88) < 83) {\n Pool2d[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 88) * 83)) + 
(((int)threadIdx.x) % 88)))] = 0.000000e+00f;\n }\n }\n compute_shared[(((int)threadIdx.x))] = (((((1 <= (((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 85))) && ((((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 85)) < 84)) && (1 <= (((int)threadIdx.x) % 85))) && ((((int)threadIdx.x) % 85) < 84)) ? data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 85) * 83)) + (((int)threadIdx.x) % 85)))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 158) {\n if ((((((int)blockIdx.x) % 21) * 4) + ((((int)threadIdx.x) + 352) / 85)) < 85) {\n compute_shared[((((int)threadIdx.x) + 352))] = (((((((((int)blockIdx.x) % 21) * 4) + ((((int)threadIdx.x) + 352) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 12) % 85))) && (((((int)threadIdx.x) + 12) % 85) < 84)) ? data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + (((((int)threadIdx.x) + 352) / 85) * 83)) + ((((int)threadIdx.x) + 12) % 85)))] : 0.000000e+00f);\n }\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 88)) < 83) {\n if ((((int)threadIdx.x) % 88) < 83) {\n Pool2d[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 88) * 83)) + (((int)threadIdx.x) % 88)))] = (Pool2d[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 88) * 83)) + (((int)threadIdx.x) % 88)))] + (compute_shared[((((((((int)threadIdx.x) / 88) * 85) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 88)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n }\n}\n", "gridDim": [112896, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,11,11]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,11,11]_[128,672,11,11].json new file mode 100644 index 
000000000..04e6dc056 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,11,11]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 11, 11], "output_shape": [128, 672, 11, 11], "window_shape": [3, 3], "window_stride": [1, 1], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_672_11_11___128_672_11_11_", "code": "extern \"C\" __global__ void roller_AvgPool__128_672_11_11___128_672_11_11_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[169];\n if (((int)threadIdx.x) < 176) {\n if ((((int)threadIdx.x) & 15) < 11) {\n Pool2d[((((((int)blockIdx.x) * 121) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = 0.000000e+00f;\n }\n }\n if (((int)threadIdx.x) < 169) {\n compute_shared[(((int)threadIdx.x))] = (((((13 <= ((int)threadIdx.x)) && (((int)threadIdx.x) < 156)) && (1 <= (((int)threadIdx.x) % 13))) && ((((int)threadIdx.x) % 13) < 12)) ? data[((((((int)blockIdx.x) * 121) + ((((int)threadIdx.x) / 13) * 11)) + (((int)threadIdx.x) % 13)))] : 0.000000e+00f);\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if (((int)threadIdx.x) < 176) {\n if ((((int)threadIdx.x) & 15) < 11) {\n Pool2d[((((((int)blockIdx.x) * 121) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = (Pool2d[((((((int)blockIdx.x) * 121) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] + (compute_shared[((((((((int)threadIdx.x) >> 4) * 13) + ((k_inner_outer / 3) * 13)) + (((int)threadIdx.x) & 15)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n }\n}\n", "gridDim": [86016, 1, 1], "blockDim": [192, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,21,21]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,21,21]_[128,672,11,11].json new file mode 100644 index 000000000..f13d5b1b7 --- 
/dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,672,21,21]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 21, 21], "output_shape": [128, 672, 11, 11], "window_shape": [3, 3], "window_stride": [2, 2], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_672_21_21___128_672_11_11_", "code": "extern \"C\" __global__ void roller_AvgPool__128_672_21_21___128_672_11_11_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[299];\n if ((((((int)blockIdx.x) & 1) * 6) + (((int)threadIdx.x) >> 4)) < 11) {\n if ((((int)threadIdx.x) & 15) < 11) {\n Pool2d[((((((((int)blockIdx.x) >> 1) * 121) + ((((int)blockIdx.x) & 1) * 66)) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = 0.000000e+00f;\n }\n }\n compute_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23))) && (1 <= (((int)threadIdx.x) % 23))) && ((((int)threadIdx.x) % 23) < 22)) ? data[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 23) * 21)) + (((int)threadIdx.x) % 23)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 96))] = (((1 <= ((((int)threadIdx.x) + 4) % 23)) && (((((int)threadIdx.x) + 4) % 23) < 22)) ? data[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + (((((int)threadIdx.x) + 96) / 23) * 21)) + ((((int)threadIdx.x) + 4) % 23)))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 192) / 23)) < 23) {\n compute_shared[((((int)threadIdx.x) + 192))] = (((((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 192) / 23)) < 22) && (1 <= ((((int)threadIdx.x) + 8) % 23))) && (((((int)threadIdx.x) + 8) % 23) < 22)) ? 
data[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + (((((int)threadIdx.x) + 192) / 23) * 21)) + ((((int)threadIdx.x) + 8) % 23)))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 11) {\n if ((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 288) / 23)) < 23) {\n compute_shared[((((int)threadIdx.x) + 288))] = ((((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 288) / 23)) < 22) && (((int)threadIdx.x) < 10)) ? data[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + (((((int)threadIdx.x) + 288) / 23) * 21)) + (((int)threadIdx.x) + 12)))] : 0.000000e+00f);\n }\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((((int)blockIdx.x) & 1) * 6) + (((int)threadIdx.x) >> 4)) < 11) {\n if ((((int)threadIdx.x) & 15) < 11) {\n Pool2d[((((((((int)blockIdx.x) >> 1) * 121) + ((((int)blockIdx.x) & 1) * 66)) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = (Pool2d[((((((((int)blockIdx.x) >> 1) * 121) + ((((int)blockIdx.x) & 1) * 66)) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] + (compute_shared[((((((((int)threadIdx.x) >> 4) * 46) + ((k_inner_outer / 3) * 23)) + ((((int)threadIdx.x) & 15) * 2)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n }\n}\n", "gridDim": [172032, 1, 1], "blockDim": [96, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,42,42]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,42,42]_[128,84,42,42].json new file mode 100644 index 000000000..fa0997473 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,42,42]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 42, 42], "output_shape": [128, 84, 42, 42], "window_shape": [3, 3], "window_stride": [1, 1], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": 
"roller_AvgPool__128_84_42_42___128_84_42_42_", "code": "extern \"C\" __global__ void roller_AvgPool__128_84_42_42___128_84_42_42_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[352];\n if ((((int)threadIdx.x) % 48) < 42) {\n Pool2d[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = 0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 44))) && (1 <= (((int)threadIdx.x) % 44))) && ((((int)threadIdx.x) % 44) < 43)) ? data[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 44) * 42)) + (((int)threadIdx.x) % 44)))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 64) {\n compute_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 7) * 6) + ((((int)threadIdx.x) + 288) / 44)) < 43) && (1 <= ((((int)threadIdx.x) + 24) % 44))) && (((((int)threadIdx.x) + 24) % 44) < 43)) ? data[((((((int)blockIdx.x) * 252) + (((((int)threadIdx.x) + 288) / 44) * 42)) + ((((int)threadIdx.x) + 24) % 44)))] : 0.000000e+00f);\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((int)threadIdx.x) % 48) < 42) {\n Pool2d[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = (Pool2d[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] + (compute_shared[((((((((int)threadIdx.x) / 48) * 44) + ((k_inner_outer / 3) * 44)) + (((int)threadIdx.x) % 48)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,83,83]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,83,83]_[128,84,42,42].json new file mode 100644 index 000000000..cd6a3dae3 --- /dev/null +++ 
b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,84,83,83]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 83, 83], "output_shape": [128, 84, 42, 42], "window_shape": [3, 3], "window_stride": [2, 2], "padding_below": [1, 1]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_84_83_83___128_84_42_42_", "code": "extern \"C\" __global__ void roller_AvgPool__128_84_83_83___128_84_42_42_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[425];\n if ((((int)threadIdx.x) % 48) < 42) {\n Pool2d[((((((int)blockIdx.x) * 84) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = 0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 21) * 4) + (((int)threadIdx.x) / 85))) && (1 <= (((int)threadIdx.x) % 85))) && ((((int)threadIdx.x) % 85) < 84)) ? data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + ((((int)threadIdx.x) / 85) * 83)) + (((int)threadIdx.x) % 85)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 96))] = (((1 <= ((((int)threadIdx.x) + 11) % 85)) && (((((int)threadIdx.x) + 11) % 85) < 84)) ? data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + (((((int)threadIdx.x) + 96) / 85) * 83)) + ((((int)threadIdx.x) + 11) % 85)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 192))] = (((1 <= ((((int)threadIdx.x) + 22) % 85)) && (((((int)threadIdx.x) + 22) % 85) < 84)) ? data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + (((((int)threadIdx.x) + 192) / 85) * 83)) + ((((int)threadIdx.x) + 22) % 85)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 21) * 4) + ((((int)threadIdx.x) + 288) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 33) % 85))) && (((((int)threadIdx.x) + 33) % 85) < 84)) ? 
data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + (((((int)threadIdx.x) + 288) / 85) * 83)) + ((((int)threadIdx.x) + 33) % 85)))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 41) {\n compute_shared[((((int)threadIdx.x) + 384))] = ((((((((int)blockIdx.x) % 21) * 4) + ((((int)threadIdx.x) + 384) / 85)) < 84) && (((int)threadIdx.x) < 40)) ? data[((((((((int)blockIdx.x) / 21) * 6889) + ((((int)blockIdx.x) % 21) * 332)) + (((((int)threadIdx.x) + 384) / 85) * 83)) + (((int)threadIdx.x) + 44)))] : 0.000000e+00f);\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((int)threadIdx.x) % 48) < 42) {\n Pool2d[((((((int)blockIdx.x) * 84) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = (Pool2d[((((((int)blockIdx.x) * 84) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] + (compute_shared[((((((((int)threadIdx.x) / 48) * 170) + ((k_inner_outer / 3) * 85)) + ((((int)threadIdx.x) % 48) * 2)) + (k_inner_outer % 3)))] * 1.111111e-01f));\n }\n }\n}\n", "gridDim": [225792, 1, 1], "blockDim": [96, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,96,165,165]_[128,96,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,96,165,165]_[128,96,83,83].json new file mode 100644 index 000000000..c31797f7e --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_AvgPool_[128,96,165,165]_[128,96,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 96, 165, 165], "output_shape": [128, 96, 83, 83], "window_shape": [1, 1], "window_stride": [2, 2], "padding_below": [0, 0]}, "op_type": "AvgPool", "tvm_func_name": "roller_AvgPool__128_96_165_165___128_96_83_83_", "code": "extern \"C\" __global__ void roller_AvgPool__128_96_165_165___128_96_83_83_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[660];\n Pool2d[((((((((int)blockIdx.x) / 83) * 27556) + 
((((int)threadIdx.x) / 48) * 6889)) + ((((int)blockIdx.x) % 83) * 83)) + (((int)threadIdx.x) % 48)))] = 0.000000e+00f;\n if ((((int)threadIdx.x) % 48) < 35) {\n Pool2d[(((((((((int)blockIdx.x) / 83) * 27556) + ((((int)threadIdx.x) / 48) * 6889)) + ((((int)blockIdx.x) % 83) * 83)) + (((int)threadIdx.x) % 48)) + 48))] = 0.000000e+00f;\n }\n compute_shared[(((int)threadIdx.x))] = data[((((((((int)blockIdx.x) / 83) * 108900) + ((((int)threadIdx.x) / 165) * 27225)) + ((((int)blockIdx.x) % 83) * 330)) + (((int)threadIdx.x) % 165)))];\n compute_shared[((((int)threadIdx.x) + 192))] = data[((((((((int)blockIdx.x) / 83) * 108900) + (((((int)threadIdx.x) + 192) / 165) * 27225)) + ((((int)blockIdx.x) % 83) * 330)) + ((((int)threadIdx.x) + 27) % 165)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((int)blockIdx.x) / 83) * 108900) + (((((int)threadIdx.x) + 384) / 165) * 27225)) + ((((int)blockIdx.x) % 83) * 330)) + ((((int)threadIdx.x) + 54) % 165)))];\n if (((int)threadIdx.x) < 84) {\n compute_shared[((((int)threadIdx.x) + 576))] = data[((((((((int)blockIdx.x) / 83) * 108900) + (((((int)threadIdx.x) + 576) / 165) * 27225)) + ((((int)blockIdx.x) % 83) * 330)) + (((int)threadIdx.x) + 81)))];\n }\n __syncthreads();\n Pool2d[((((((((int)blockIdx.x) / 83) * 27556) + ((((int)threadIdx.x) / 48) * 6889)) + ((((int)blockIdx.x) % 83) * 83)) + (((int)threadIdx.x) % 48)))] = (Pool2d[((((((((int)blockIdx.x) / 83) * 27556) + ((((int)threadIdx.x) / 48) * 6889)) + ((((int)blockIdx.x) % 83) * 83)) + (((int)threadIdx.x) % 48)))] + compute_shared[((((((int)threadIdx.x) / 48) * 165) + ((((int)threadIdx.x) % 48) * 2)))]);\n if ((((int)threadIdx.x) % 48) < 35) {\n Pool2d[(((((((((int)blockIdx.x) / 83) * 27556) + ((((int)threadIdx.x) / 48) * 6889)) + ((((int)blockIdx.x) % 83) * 83)) + (((int)threadIdx.x) % 48)) + 48))] = (Pool2d[(((((((((int)blockIdx.x) / 83) * 27556) + ((((int)threadIdx.x) / 48) * 6889)) + ((((int)blockIdx.x) % 83) * 83)) + (((int)threadIdx.x) % 48)) + 48))] + 
compute_shared[(((((((int)threadIdx.x) / 48) * 165) + ((((int)threadIdx.x) % 48) * 2)) + 96))]);\n }\n}\n", "gridDim": [254976, 1, 1], "blockDim": [192, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,21,21]_[168,1008,1,1]_[128,168,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,21,21]_[168,1008,1,1]_[128,168,21,21].json new file mode 100644 index 000000000..b6fcf0da1 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,21,21]_[168,1008,1,1]_[128,168,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1008, 21, 21], "filter_shape": [168, 1008, 1, 1], "output_shape": [128, 168, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Convolution", "tvm_func_name": "roller_Convolution__128_1008_21_21___168_1008_1_1___128_168_21_21_", "code": "extern \"C\" __global__ void roller_Convolution__128_1008_21_21___168_1008_1_1___128_168_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[6144];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n 
compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 
127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1323))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 3969))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 993) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6615))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 990) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 987) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9261))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 984) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 981) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11907))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 978) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 441) * 444528) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[(((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36288))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 60480))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 72576))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 84672))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 108864))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 120960))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 133056))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 145152))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 157248))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + 
(((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3840))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 4608))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 5376))];\n if (((k_outer * 32) + k_inner_outer) < 1008) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + 
(compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n 
compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * 
compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) 
>> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = compute_local[(2)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = compute_local[(3)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = compute_local[(4)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = compute_local[(5)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = compute_local[(6)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = compute_local[(7)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354752))] = compute_local[(8)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354768))] = compute_local[(9)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354784))] = compute_local[(10)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354800))] = compute_local[(11)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354816))] = compute_local[(12)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354832))] = compute_local[(13)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 
128)) + (((int)threadIdx.x) & 15)) + 1354848))] = compute_local[(14)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1354864))] = compute_local[(15)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709504))] = compute_local[(16)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709520))] = compute_local[(17)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709536))] = compute_local[(18)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709552))] = compute_local[(19)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709568))] = compute_local[(20)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709584))] = compute_local[(21)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709600))] = compute_local[(22)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 2709616))] = compute_local[(23)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064256))] = compute_local[(24)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064272))] = compute_local[(25)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064288))] = compute_local[(26)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064304))] = compute_local[(27)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064320))] = compute_local[(28)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064336))] = compute_local[(29)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064352))] = compute_local[(30)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4064368))] = compute_local[(31)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419008))] = compute_local[(32)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419024))] = compute_local[(33)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419040))] = compute_local[(34)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419056))] = compute_local[(35)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419072))] = compute_local[(36)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419088))] = compute_local[(37)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419104))] = compute_local[(38)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 5419120))] = compute_local[(39)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773760))] = compute_local[(40)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773776))] = compute_local[(41)];\n compute[((((((((int)threadIdx.x) 
>> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773792))] = compute_local[(42)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773808))] = compute_local[(43)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773824))] = compute_local[(44)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773840))] = compute_local[(45)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773856))] = compute_local[(46)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6773872))] = compute_local[(47)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128512))] = compute_local[(48)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128528))] = compute_local[(49)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128544))] = compute_local[(50)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128560))] = compute_local[(51)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128576))] = compute_local[(52)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128592))] = compute_local[(53)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128608))] = compute_local[(54)];\n compute[((((((((int)threadIdx.x) >> 4) * 56448) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8128624))] = compute_local[(55)];\n}\n", "gridDim": 
[441, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[168,1008,1,1]_[128,168,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[168,1008,1,1]_[128,168,42,42].json new file mode 100644 index 000000000..f4598c731 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[168,1008,1,1]_[128,168,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1008, 42, 42], "filter_shape": [168, 1008, 1, 1], "output_shape": [128, 168, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_1008_42_42___168_1008_1_1___128_168_42_42_", "code": "extern \"C\" __global__ void roller_Convolution__128_1008_42_42___168_1008_1_1___128_168_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[6144];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] 
= 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + 
(((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))];\n compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 993) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 990) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 987) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 984) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 981) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 978) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 1778112) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[(((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36288))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 60480))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 72576))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 84672))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 108864))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 120960))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 133056))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 145152))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 1008) ? kernel[((((((((int)threadIdx.x) >> 5) * 1008) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 157248))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + 
k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3456))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3840))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4224))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4608))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4992))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5376))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5760))];\n if (((k_outer * 32) + k_inner_outer) < 1008) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = 
(compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = 
(compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 
5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))] = (compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))]);\n compute[((((((((int)threadIdx.x) 
>> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))] = (compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))] = (compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))] = (compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))] = (compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))] = (compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))] = (compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))] = (compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))] = (compute_local[(36)] + 
bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))] = (compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))] = (compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))] = (compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))] = (compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))] = (compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))] = (compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))] = (compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))]);\n 
compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))] = (compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))] = (compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))] = (compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))] = (compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))] = (compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))] = (compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))] = (compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))] = 
(compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))] = (compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))] = (compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))] = (compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))] = (compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))]);\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[336,1008,1,1]_[128,336,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[336,1008,1,1]_[128,336,42,42].json new file mode 100644 index 000000000..a3c941029 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1008,42,42]_[336,1008,1,1]_[128,336,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1008, 42, 42], "filter_shape": [336, 1008, 1, 1], "output_shape": [128, 336, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", 
"tvm_func_name": "roller_Convolution__128_1008_42_42___336_1008_1_1___128_336_42_42_", "code": "extern \"C\" __global__ void roller_Convolution__128_1008_42_42___336_1008_1_1___128_336_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[1024];\n __shared__ float compute_d_shared[6144];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n 
compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 63; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 64) + (((int)threadIdx.x) & 63)) / 1764) * 1778112) + (k_outer * 28224)) + ((((int)threadIdx.x) >> 6) * 1764)) + (((((int)blockIdx.x) * 64) + (((int)threadIdx.x) & 63)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 64) + (((int)threadIdx.x) & 63)) / 1764) * 1778112) + (k_outer * 28224)) + ((((int)threadIdx.x) >> 6) * 1764)) + (((((int)blockIdx.x) * 64) + (((int)threadIdx.x) & 63)) % 1764)) + 10584))];\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 64) + (((int)threadIdx.x) & 63)) / 1764) * 1778112) + (k_outer * 28224)) + ((((int)threadIdx.x) >> 6) * 1764)) + (((((int)blockIdx.x) * 64) + (((int)threadIdx.x) & 63)) % 1764)) + 21168))];\n }\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + 
(((int)threadIdx.x) & 15)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 24192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 72576))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 96768))];\n compute_d_shared[((((int)threadIdx.x) + 1920))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 120960))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 145152))];\n compute_d_shared[((((int)threadIdx.x) + 2688))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 169344))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 193536))];\n compute_d_shared[((((int)threadIdx.x) + 3456))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 217728))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 241920))];\n compute_d_shared[((((int)threadIdx.x) + 4224))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 266112))];\n compute_d_shared[((((int)threadIdx.x) + 4608))] = kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 290304))];\n compute_d_shared[((((int)threadIdx.x) + 4992))] = 
kernel[((((((((int)threadIdx.x) >> 4) * 1008) + (k_outer * 16)) + (((int)threadIdx.x) & 15)) + 314496))];\n compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 16; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 2688))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 3072))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 3456))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 3840))];\n 
compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 4224))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 4608))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 4992))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 5376))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 16) + k_inner_outer) + 5760))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = 
(compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = 
(compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16))]);\n 
compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419008))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419008))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419024))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419024))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419040))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419040))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419056))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 5419056))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838016))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838016))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838032))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 
4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838032))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838048))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838048))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838064))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 10838064))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257024))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257024))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257040))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257040))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257056))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257056))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257072))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 16257072))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676032))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676032))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) 
+ (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676048))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676064))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676064))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676080))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 21676080))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095040))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095040))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095056))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095056))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095072))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095072))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095088))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 27095088))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514048))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 4) * 
225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514048))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514064))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514064))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514080))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514080))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514096))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 32514096))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933056))] = (compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933056))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933072))] = (compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933072))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933088))] = (compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933088))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933104))] = (compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 37933104))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + 
(((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352064))] = (compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352064))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352080))] = (compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352080))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352096))] = (compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352096))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352112))] = (compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 43352112))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771072))] = (compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771072))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771088))] = (compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771088))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771104))] = (compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771104))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771120))] = (compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 4) * 
225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 48771120))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190080))] = (compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190080))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190096))] = (compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190096))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190112))] = (compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190112))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190128))] = (compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 54190128))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609088))] = (compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609088))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609104))] = (compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609104))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609120))] = (compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609120))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + 
(((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609136))] = (compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 59609136))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028096))] = (compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028096))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028112))] = (compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028112))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028128))] = (compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028128))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028144))] = (compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 65028144))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447104))] = (compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447104))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447120))] = (compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447120))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447136))] = (compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 4) * 
225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447136))]);\n compute[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447152))] = (compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 4) * 225792) + (((int)blockIdx.x) * 64)) + (((int)threadIdx.x) & 15)) + 70447152))]);\n}\n", "gridDim": [3528, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1344,21,21]_[336,1344,1,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1344,21,21]_[336,1344,1,1]_[128,336,21,21].json new file mode 100644 index 000000000..695808679 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,1344,21,21]_[336,1344,1,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1344, 21, 21], "filter_shape": [336, 1344, 1, 1], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_1344_21_21___336_1344_1_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_Convolution__128_1344_21_21___336_1344_1_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n 
compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 42; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1323))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 
441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 3969))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6615))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))];\n compute_shared[((((int)threadIdx.x) + 2688))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9261))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 3456))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11907))];\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 592704) + (k_outer * 14112)) + 
((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))];\n }\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 288) ? kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 64512))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 276) ? kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 80640))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 264) ? kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 252) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 1344)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 112896))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n 
compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 
128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677376))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677376))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677408))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677408))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677440))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677440))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677472))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677472))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) 
+ 1354752))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032128))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032128))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032160))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + 
(((int)threadIdx.x) & 31)) + 2032160))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032192))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032192))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032224))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032224))]);\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 288) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n }\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 276) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386880))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386880))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386912))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386912))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386944))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386944))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386976))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386976))]);\n }\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 264) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064256))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064256))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064288))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064288))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064320))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064320))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064352))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064352))]);\n }\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 252) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741632))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741632))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 
31)) + 4741664))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741664))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741696))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741696))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741728))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741728))]);\n }\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_bias.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_bias.json new file mode 100644 index 000000000..d8a2cd45c --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_bias.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "filter_shape": [168, 168, 1, 1], "output_shape": [128, 168, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_168_42_42___168_168_1_1___128_168_42_42_", "code": "extern \"C\" __global__ void roller_Convolution__128_168_42_42___168_168_1_1___128_168_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, 
float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[6144];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n 
compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 6; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 162) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 159) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 156) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 153) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 150) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 147) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 144) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 141) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 138) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[(((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10080))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14112))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18144))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20160))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 22176))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 26208))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 
3072))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3456))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3840))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4224))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4608))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4992))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5376))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5760))];\n if (((k_outer * 32) + k_inner_outer) < 168) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = 
(compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 
31)))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = (compute_local[(8)] + 
bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]);\n 
compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = 
(compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))] = (compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))] = (compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))] = (compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 
31)) + 18966592))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))] = (compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))] = (compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))] = (compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))] = (compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))] = (compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))] = (compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))] = (compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) 
& 31)) + 24385600))] = (compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))] = (compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))] = (compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))] = (compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))] = (compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))] = (compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))] = (compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))] = (compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 31)) + 29804576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))] = (compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))] = (compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))] = (compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))] = (compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))] = (compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))] = (compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))] = (compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) 
+ (((int)threadIdx.x) & 31)) + 35223584))] = (compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))] = (compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))] = (compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))]);\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_relu.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_relu.json new file mode 100644 index 000000000..01c112a52 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[168,168,1,1]_[128,168,42,42]_relu.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "filter_shape": [168, 168, 1, 1], "output_shape": [128, 168, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_168_42_42___168_168_1_1___128_168_42_42__relu", "code": "extern \"C\" __global__ void roller_Convolution__128_168_42_42___168_168_1_1___128_168_42_42__relu(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[6144];\n float compute_shared_local[4];\n float 
compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 
0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 6; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 162) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 159) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 156) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 153) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 150) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 147) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 144) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 141) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 138) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[(((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10080))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14112))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18144))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20160))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 22176))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 26208))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3456))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 
3840))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4224))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4608))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4992))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5376))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5760))];\n if (((k_outer * 32) + k_inner_outer) < 168) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(12)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = 
(compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) 
+ (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) 
+ (((int)threadIdx.x) & 31)) + 5419008))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 
128)) + (((int)threadIdx.x) & 31)) + 8128608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) 
>> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))] = max((compute_local[(36)] + 
bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))] = max((compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))] = 
max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) 
+ 32514112))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))]), 0.000000e+00f);\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[84,168,1,1]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[84,168,1,1]_[128,84,42,42].json new file mode 100644 index 000000000..9fc2c38ae --- /dev/null 
+++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,42,42]_[84,168,1,1]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "filter_shape": [84, 168, 1, 1], "output_shape": [128, 84, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Convolution", "tvm_func_name": "roller_Convolution__128_168_42_42___84_168_1_1___128_84_42_42_", "code": "extern \"C\" __global__ void roller_Convolution__128_168_42_42___84_168_1_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float compute_shared[8192];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n 
compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 6; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 384) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 768))] = 
data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 1152) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 1920) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 1920) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 159) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 15876))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 2688) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 2688) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 156) ? 
data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 21168))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 3456) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 3456) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 153) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 26460))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 4224) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 4224) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 150) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 4992) >> 8)) < 168) ? 
data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 4992) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5376))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 147) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5760))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 5760) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 5760) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6144))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 144) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6528))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 6528) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 6528) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6912))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 141) ? 
data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 47628))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7296))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 7296) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 7296) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + ((((int)threadIdx.x) + 128) & 255)) % 1764)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7680))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 8)) < 138) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 1764) * 296352) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 1764)) + 52920))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 128) {\n compute_shared[((((int)threadIdx.x) + 8064))] = ((((k_outer * 32) + ((((int)threadIdx.x) + 8064) >> 8)) < 168) ? data[((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) + 128)) / 1764) * 296352) + (k_outer * 56448)) + (((((int)threadIdx.x) + 8064) >> 8) * 1764)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) + 128)) % 1764)))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[(((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10080))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 96))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 128))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 160))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 192))];\n compute_shared_local[(7)] = 
compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 224))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n if (((k_outer * 32) + k_inner_outer) < 168) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = 
(compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + 
(compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)))] = 
compute_local[(0)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 32))] = compute_local[(1)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64))] = compute_local[(2)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 96))] = compute_local[(3)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 128))] = compute_local[(4)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 160))] = compute_local[(5)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 192))] = compute_local[(6)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 224))] = compute_local[(7)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709504))] = compute_local[(8)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709536))] = compute_local[(9)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709568))] = compute_local[(10)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709600))] = compute_local[(11)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709632))] = compute_local[(12)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709664))] = compute_local[(13)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709696))] = 
compute_local[(14)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 2709728))] = compute_local[(15)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419008))] = compute_local[(16)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419040))] = compute_local[(17)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419072))] = compute_local[(18)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419104))] = compute_local[(19)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419136))] = compute_local[(20)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419168))] = compute_local[(21)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419200))] = compute_local[(22)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 5419232))] = compute_local[(23)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128512))] = compute_local[(24)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128544))] = compute_local[(25)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128576))] = compute_local[(26)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128608))] = compute_local[(27)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + 
(((int)threadIdx.x) & 31)) + 8128640))] = compute_local[(28)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128672))] = compute_local[(29)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128704))] = compute_local[(30)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 8128736))] = compute_local[(31)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838016))] = compute_local[(32)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838048))] = compute_local[(33)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838080))] = compute_local[(34)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838112))] = compute_local[(35)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838144))] = compute_local[(36)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838176))] = compute_local[(37)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838208))] = compute_local[(38)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 10838240))] = compute_local[(39)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547520))] = compute_local[(40)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547552))] = compute_local[(41)];\n compute[((((((((int)threadIdx.x) >> 5) 
* 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547584))] = compute_local[(42)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547616))] = compute_local[(43)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547648))] = compute_local[(44)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547680))] = compute_local[(45)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547712))] = compute_local[(46)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 13547744))] = compute_local[(47)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257024))] = compute_local[(48)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257056))] = compute_local[(49)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257088))] = compute_local[(50)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257120))] = compute_local[(51)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257152))] = compute_local[(52)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257184))] = compute_local[(53)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257216))] = compute_local[(54)];\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16257248))] = 
compute_local[(55)];\n}\n", "gridDim": [882, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,83,83]_[84,168,1,1]_[128,84,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,83,83]_[84,168,1,1]_[128,84,83,83].json new file mode 100644 index 000000000..c469670ac --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,168,83,83]_[84,168,1,1]_[128,84,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 83, 83], "filter_shape": [84, 168, 1, 1], "output_shape": [128, 84, 83, 83], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_168_83_83___84_168_1_1___128_84_83_83_", "code": "extern \"C\" __global__ void roller_Convolution__128_168_83_83___84_168_1_1___128_84_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n 
compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 6; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 
6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 13778))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 27556))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 41334))];\n compute_shared[((((int)threadIdx.x) + 1024))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 160) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 55112))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1280))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 158) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 68890))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 156) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 82668))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1792))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 154) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 96446))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2048))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 152) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 110224))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 150) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 124002))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2560))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 148) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 137780))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2816))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 146) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 151558))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 144) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 165336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3328))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 142) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 179114))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3584))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 140) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 192892))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 138) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 6889) * 1157352) + (k_outer * 220448)) + ((((int)threadIdx.x) >> 7) * 6889)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 6889)) + 206670))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[(((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 256))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1344))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 512))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2688))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1024))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5376))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1280))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6720))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1792))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 9408))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2048))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10752))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2560))] = (((((int)threadIdx.x) < 128) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 168)) ? kernel[((((((((int)threadIdx.x) >> 5) * 168) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 13440))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2816))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 3072))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 3328))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 3584))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 3840))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 
1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2816))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3328))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3584))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3840))];\n if (((k_outer * 32) + k_inner_outer) < 168) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + 
(compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n 
compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n 
}\n }\n compute[(((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054336))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054336))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054368))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054368))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054400))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054400))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054432))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) 
+ (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 7054432))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108672))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108672))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108704))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108704))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108736))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108736))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108768))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 14108768))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163008))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163008))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163040))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163072))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163104))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21163104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217344))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217344))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217376))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217376))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217408))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217408))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217440))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 28217440))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271680))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271680))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271712))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271712))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271744))] = (compute_local[(22)] + 
bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271744))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271776))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35271776))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326016))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326016))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326048))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326080))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326112))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 42326112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380352))] = (compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380352))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380384))] = (compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380384))]);\n 
compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380416))] = (compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380416))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380448))] = (compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 49380448))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434688))] = (compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434688))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434720))] = (compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434720))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434752))] = (compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434752))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434784))] = (compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 56434784))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489024))] = (compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489024))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489056))] = 
(compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489056))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489088))] = (compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489088))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489120))] = (compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 63489120))]);\n if (((int)threadIdx.x) < 128) {\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543360))] = (compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543360))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543392))] = (compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543392))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543424))] = (compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543424))]);\n compute[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543456))] = (compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 881792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 70543456))]);\n }\n}\n", "gridDim": [6889, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,11,11]_[336,2016,1,1]_[128,336,11,11].json 
b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,11,11]_[336,2016,1,1]_[128,336,11,11].json new file mode 100644 index 000000000..41ee439bf --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,11,11]_[336,2016,1,1]_[128,336,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2016, 11, 11], "filter_shape": [336, 2016, 1, 1], "output_shape": [128, 336, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Convolution", "tvm_func_name": "roller_Convolution__128_2016_11_11___336_2016_1_1___128_336_11_11_", "code": "extern \"C\" __global__ void roller_Convolution__128_2016_11_11___336_2016_1_1___128_336_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float compute_local[32];\n __shared__ float compute_shared[3072];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n 
compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 63; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? data[(((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 384))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 484))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 768))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 968))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1152))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 1452))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1536))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? 
data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 1936))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1920))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 2420))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 2904))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = (((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) < 15488) ? 
data[((((((((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) / 121) * 243936) + (k_outer * 3872)) + ((((int)threadIdx.x) / 96) * 121)) + ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 96)) % 121)) + 3388))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 72576))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))];\n compute_d_shared[((((int)threadIdx.x) + 1920))] = kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 120960))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = (((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) >> 5)) < 264) ? kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 145152))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = (((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) >> 5)) < 252) ? 
kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 169344))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3072))] = (((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) >> 5)) < 240) ? kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 193536))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3456))] = (((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) >> 5)) < 228) ? kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 217728))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_d_shared[((((int)threadIdx.x) + 3840))] = (((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) >> 5)) < 216) ? kernel[(((((((((int)blockIdx.x) / 162) * 258048) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 241920))] : 0.000000e+00f);\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 96) + (((int)threadIdx.x) % 24)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 96) + (((int)threadIdx.x) % 24)) + 24))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 96) + (((int)threadIdx.x) % 24)) + 48))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 96) + (((int)threadIdx.x) % 24)) + 72))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) / 24) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + 
k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) / 24) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = 
(compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + 
((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)))] = compute_local[(0)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 24))] = compute_local[(1)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 48))] = compute_local[(2)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 72))] = compute_local[(3)];\n }\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 247808))] = compute_local[(4)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 247832))] = compute_local[(5)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 247856))] = compute_local[(6)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 247880))] = compute_local[(7)];\n }\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 
15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 495616))] = compute_local[(8)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 495640))] = compute_local[(9)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 495664))] = compute_local[(10)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 495688))] = compute_local[(11)];\n }\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 743424))] = compute_local[(12)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 743448))] = compute_local[(13)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 743472))] = compute_local[(14)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 743496))] = compute_local[(15)];\n }\n compute[(((((((((int)blockIdx.x) / 162) * 
1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 991232))] = compute_local[(16)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 991256))] = compute_local[(17)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 991280))] = compute_local[(18)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 991304))] = compute_local[(19)];\n }\n if ((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) / 24)) < 256) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1239040))] = compute_local[(20)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1239064))] = compute_local[(21)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1239088))] = compute_local[(22)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 
162) * 96)) + (((int)threadIdx.x) % 24)) + 1239112))] = compute_local[(23)];\n }\n }\n if ((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) / 24)) < 240) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1486848))] = compute_local[(24)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1486872))] = compute_local[(25)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1486896))] = compute_local[(26)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1486920))] = compute_local[(27)];\n }\n }\n if ((((((int)blockIdx.x) / 162) * 128) + (((int)threadIdx.x) / 24)) < 224) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1734656))] = compute_local[(28)];\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15464) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1734680))] = compute_local[(29)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15440) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1734704))] = 
compute_local[(30)];\n }\n if ((((((int)blockIdx.x) % 162) * 96) + (((int)threadIdx.x) % 24)) < 15416) {\n compute[(((((((((int)blockIdx.x) / 162) * 1982464) + ((((int)threadIdx.x) / 24) * 15488)) + ((((int)blockIdx.x) % 162) * 96)) + (((int)threadIdx.x) % 24)) + 1734728))] = compute_local[(31)];\n }\n }\n}\n", "gridDim": [486, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[336,2016,1,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[336,2016,1,1]_[128,336,21,21].json new file mode 100644 index 000000000..8d574e4df --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[336,2016,1,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2016, 21, 21], "filter_shape": [336, 2016, 1, 1], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_2016_21_21___336_2016_1_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_Convolution__128_2016_21_21___336_2016_1_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n 
compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 63; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1323))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) 
% 441)) + 3969))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6615))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))];\n compute_shared[((((int)threadIdx.x) + 2688))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9261))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 3456))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11907))];\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + 
((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))];\n }\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 72576))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 288) ? kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 276) ? kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 120960))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 264) ? kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 145152))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = (((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 252) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 193536) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 169344))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n 
compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 
128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677376))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677376))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677408))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677408))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677440))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677440))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677472))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 677472))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) 
+ 1354752))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032128))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032128))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032160))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + 
(((int)threadIdx.x) & 31)) + 2032160))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032192))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032192))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032224))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2032224))]);\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 288) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n }\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 276) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386880))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386880))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386912))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386912))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386944))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386944))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386976))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3386976))]);\n }\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 264) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064256))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064256))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064288))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064288))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064320))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064320))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064352))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4064352))]);\n }\n if ((((((int)blockIdx.x) / 441) * 96) + (((int)threadIdx.x) >> 5)) < 252) {\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741632))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741632))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 
31)) + 4741664))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741664))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741696))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741696))]);\n compute[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741728))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 441) * 5419008) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 4741728))]);\n }\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[672,2016,1,1]_[128,672,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[672,2016,1,1]_[128,672,21,21].json new file mode 100644 index 000000000..0d7c82033 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2016,21,21]_[672,2016,1,1]_[128,672,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2016, 21, 21], "filter_shape": [672, 2016, 1, 1], "output_shape": [128, 672, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_2016_21_21___672_2016_1_1___128_672_21_21_", "code": "extern \"C\" __global__ void roller_Convolution__128_2016_21_21___672_2016_1_1___128_672_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, 
float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 63; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + 
((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 882))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1764))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 3528))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 4410))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6174))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + 
((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7056))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 8820))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9702))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11466))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 12348))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 
889056) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = (((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 640) ? kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 64512))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1280))] = (((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 632) ? kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 80640))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = (((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 624) ? kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1792))] = (((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 616) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 129024) + ((((int)threadIdx.x) >> 5) * 2016)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 112896))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n 
compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 
128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) 
+ 903168))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 
31)) + 1354784))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))]);\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 640) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) 
* 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 632) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 624) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + 
((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 616) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))] = 
(compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))]);\n }\n}\n", "gridDim": [4851, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2688,11,11]_[672,2688,1,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2688,11,11]_[672,2688,1,1]_[128,672,11,11].json new file mode 100644 index 000000000..8fba2037c --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,2688,11,11]_[672,2688,1,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2688, 11, 11], "filter_shape": [672, 2688, 1, 1], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_2688_11_11___672_2688_1_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_Convolution__128_2688_11_11___672_2688_1_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ 
bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[8];\n float compute_d_shared_local[4];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 84; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 
128) + (((int)threadIdx.x) & 127)) % 121)) + 363))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 726))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1089))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1452))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1815))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2178))];\n compute_shared[((((int)threadIdx.x) + 2688))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2541))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + 
((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2904))];\n compute_shared[((((int)threadIdx.x) + 3456))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3267))];\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 325248) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3630))];\n }\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 64512))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 129024))];\n compute_d_shared[((((int)threadIdx.x) + 1920))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 161280))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 
2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 193536))];\n compute_d_shared[((((int)threadIdx.x) + 2688))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 2688)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 225792))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2304))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = 
(compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n 
compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371712))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371712))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743424))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 
121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743424))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115136))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115136))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 16))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371728))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371728))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743440))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743440))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115152))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115152))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + 
((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371744))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371744))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743456))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743456))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115168))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115168))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 48))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371760))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + 
((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371760))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743472))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743472))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115184))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115184))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371776))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371776))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743488))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743488))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 
4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115200))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115200))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 80))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371792))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371792))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743504))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743504))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115216))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115216))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 
15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371808))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371808))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743520))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743520))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115232))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115232))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 112))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371824))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 371824))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + 
((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743536))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 743536))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115248))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 4) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 15)) + 1115248))]);\n}\n", "gridDim": [847, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,3,331,331]_[96,3,3,3]_[128,96,165,165]_relu.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,3,331,331]_[96,3,3,3]_[128,96,165,165]_relu.json new file mode 100644 index 000000000..fcfcfac00 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,3,331,331]_[96,3,3,3]_[128,96,165,165]_relu.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 3, 331, 331], "filter_shape": [96, 3, 3, 3], "output_shape": [128, 96, 165, 165], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_3_331_331___96_3_3_3___128_96_165_165__relu", "code": "extern \"C\" __global__ void roller_Convolution__128_3_331_331___96_3_3_3___128_96_165_165__relu(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[1024];\n __shared__ float compute_d_shared[1024];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 
0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 4; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = ((((k_outer * 8) + (((int)threadIdx.x) >> 7)) < 27) ? data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 27225) * 328683) + ((((k_outer * 8) + (((int)threadIdx.x) >> 7)) / 9) * 109561)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 27225) / 165) * 662)) + (((((k_outer * 8) + (((int)threadIdx.x) >> 7)) % 9) / 3) * 331)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 165) * 2)) + (((k_outer * 8) + (((int)threadIdx.x) >> 7)) % 3)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 512))] = ((((k_outer * 8) + (((int)threadIdx.x) >> 7)) < 23) ? 
data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 27225) * 328683) + (((((k_outer * 8) + (((int)threadIdx.x) >> 7)) + 4) / 9) * 109561)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 27225) / 165) * 662)) + ((((((k_outer * 8) + (((int)threadIdx.x) >> 7)) + 4) % 9) / 3) * 331)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 165) * 2)) + ((((k_outer * 8) + (((int)threadIdx.x) >> 7)) + 1) % 3)))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 8) + (((int)threadIdx.x) & 7)) < 27) ? kernel[(((((((int)threadIdx.x) >> 3) * 27) + (k_outer * 8)) + (((int)threadIdx.x) & 7)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 512))] = (((((int)threadIdx.x) < 256) && (((k_outer * 8) + (((int)threadIdx.x) & 7)) < 27)) ? kernel[((((((((int)threadIdx.x) >> 3) * 27) + (k_outer * 8)) + (((int)threadIdx.x) & 7)) + 1728))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 8; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 128))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 256))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 384))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 512))];\n 
compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 640))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 768))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 8) + k_inner_outer) + 896))];\n if (((k_outer * 8) + k_inner_outer) < 27) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = 
(compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 3484800) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756800))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756800))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756832))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756832))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756864))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756864))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 55756896))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 
128)) + (((int)threadIdx.x) & 31)) + 55756896))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513600))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513600))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513632))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513632))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513664))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513664))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513696))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 111513696))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270400))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270400))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270432))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270432))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270464))] = max((compute_local[(14)] + 
bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270464))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270496))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 167270496))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027200))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027200))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027232))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027232))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027264))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027296))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 223027296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784000))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784000))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) 
+ 278784032))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784032))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784064))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784064))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784096))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 3484800) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 278784096))]), 0.000000e+00f);\n}\n", "gridDim": [27225, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_bias.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_bias.json new file mode 100644 index 000000000..d0edf9f60 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_bias.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 21, 21], "filter_shape": [336, 336, 1, 1], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_336_21_21___336_336_1_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_Convolution__128_336_21_21___336_336_1_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n 
float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 11; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 882))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + 
(((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1764))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 3528))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 4410))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6174))];\n compute_shared[((((int)threadIdx.x) + 2048))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 320) ? 
data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7056))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 318) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2560))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 316) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 8820))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2816))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 314) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9702))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 312) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3328))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 310) ? 
data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11466))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3584))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 308) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 12348))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 306) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 256))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2688))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 512))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 320) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5376))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 312) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1024))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 304) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10752))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1280))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 296) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 13440))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 288) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1792))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 280) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18816))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n if (((k_outer * 32) + k_inner_outer) < 336) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + 
(compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = 
(compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))]);\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 320) {\n 
compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 312) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + 
((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 304) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))] = 
(compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 296) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 
5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 288) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 280) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + 
((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))]);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))]);\n }\n}\n", "gridDim": [2646, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_relu.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_relu.json new file mode 100644 index 000000000..a9740700d --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,21,21]_[336,336,1,1]_[128,336,21,21]_relu.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 21, 21], "filter_shape": [336, 336, 1, 1], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": 
"roller_Convolution__128_336_21_21___336_336_1_1___128_336_21_21__relu", "code": "extern \"C\" __global__ void roller_Convolution__128_336_21_21___336_336_1_1___128_336_21_21__relu(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 11; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 
441) * 128) + (((int)threadIdx.x) & 127)) % 441)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 882))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 1764))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 2646))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 3528))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 4410))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + 
((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 6174))];\n compute_shared[((((int)threadIdx.x) + 2048))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 320) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7056))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 318) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 7938))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2560))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 316) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 8820))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2816))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 314) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 9702))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 312) ? 
data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 10584))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3328))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 310) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 11466))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3584))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 308) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 12348))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 306) ? data[((((((((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) / 441) * 148176) + (k_outer * 14112)) + ((((int)threadIdx.x) >> 7) * 441)) + ((((((int)blockIdx.x) % 441) * 128) + (((int)threadIdx.x) & 127)) % 441)) + 13230))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 256))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2688))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 512))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 320) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5376))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 312) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1024))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 304) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10752))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1280))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 296) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 13440))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 288) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? 
kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1792))] = ((((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 280) && (((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336)) ? kernel[(((((((((int)blockIdx.x) / 441) * 21504) + ((((int)threadIdx.x) >> 5) * 336)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18816))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n if (((k_outer * 32) + k_inner_outer) < 336) {\n compute_local[(0)] = 
(compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n 
compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + 
((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451584))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451616))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451648))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 
451648))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 451680))]), 0.000000e+00f);\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 320) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903168))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903200))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903232))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 903264))]), 
0.000000e+00f);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 312) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354752))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354784))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354816))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1354848))]), 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 304) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 
56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806336))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806368))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806400))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 1806432))]), 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 296) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257920))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) 
* 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257952))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2257984))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2258016))]), 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 288) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 
5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]), 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) / 441) * 64) + (((int)threadIdx.x) >> 5)) < 280) {\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161120))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161152))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) >> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 441) * 3612672) + ((((int)threadIdx.x) 
>> 5) * 56448)) + ((((int)blockIdx.x) % 441) * 128)) + (((int)threadIdx.x) & 31)) + 3161184))]), 0.000000e+00f);\n }\n}\n", "gridDim": [2646, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,42,42]_[168,336,1,1]_[128,168,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,42,42]_[168,336,1,1]_[128,168,42,42].json new file mode 100644 index 000000000..45009d8a7 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,336,42,42]_[168,336,1,1]_[128,168,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 42, 42], "filter_shape": [168, 336, 1, 1], "output_shape": [128, 168, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_336_42_42___168_336_1_1___128_168_42_42_", "code": "extern \"C\" __global__ void roller_Convolution__128_336_42_42___168_336_1_1___128_168_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[6144];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n 
compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 11; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = 
data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))];\n compute_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 321) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 318) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 315) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 312) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 309) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 306) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 592704) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[(((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12096))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20160))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28224))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36288))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40320))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4224))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 44352))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4608))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 4992))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 336) ? kernel[((((((((int)threadIdx.x) >> 5) * 336) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 52416))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 5376))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 5760))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + 
k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3456))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3840))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4224))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4608))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 4992))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5376))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 5760))];\n if (((k_outer * 32) + k_inner_outer) < 336) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = 
(compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = 
(compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 
2709536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = 
(compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 
13547520))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) 
+ 18966528))] = (compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966528))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))] = (compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966560))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))] = (compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966592))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))] = (compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 18966624))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))] = (compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676032))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))] = (compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676064))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))] = (compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676096))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 21676128))] = (compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 31)) + 21676128))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))] = (compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))] = (compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))] = (compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))] = (compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 24385632))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))] = (compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))] = (compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))] = (compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) 
+ (((int)threadIdx.x) & 31)) + 27095136))] = (compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 27095136))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))] = (compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))] = (compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))] = (compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804608))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))] = (compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 29804640))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))] = (compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))] = (compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))] = (compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))] = (compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32514144))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))] = (compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))] = (compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))] = (compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))] = (compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 35223648))]);\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,4032,11,11]_[672,4032,1,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,4032,11,11]_[672,4032,1,1]_[128,672,11,11].json new file mode 100644 index 000000000..abe9f90ac --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,4032,11,11]_[672,4032,1,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 4032, 11, 11], "filter_shape": 
[672, 4032, 1, 1], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_4032_11_11___672_4032_1_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_Convolution__128_4032_11_11___672_4032_1_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 126; ++k_outer) {\n __syncthreads();\n 
compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 242))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 484))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 726))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 968))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1210))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1452))];\n 
compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1694))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1936))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2178))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2420))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2662))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2904))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 
121)) + 3146))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3388))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 487872) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3630))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 64512))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 96768))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = (((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 640) ? kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 129024))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1280))] = (((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 632) ? 
kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 161280))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = (((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 624) ? kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 193536))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1792))] = (((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 616) ? kernel[(((((((((int)blockIdx.x) / 121) * 258048) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 225792))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 
5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + 
((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 123904))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 123904))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 123936))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 123936))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 123968))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + 
(((int)threadIdx.x) & 31)) + 123968))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 124000))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 124000))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247808))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247808))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247840))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247840))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247872))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247872))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247904))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 247904))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 
371712))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371712))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371744))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371744))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371776))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371776))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371808))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371808))]);\n if ((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 640) {\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495616))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495616))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495648))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) 
+ ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495648))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495680))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495680))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495712))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 495712))]);\n }\n if ((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 632) {\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619520))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619520))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619552))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619552))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619584))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619584))]);\n compute[(((((((((int)blockIdx.x) / 
121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619616))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 619616))]);\n }\n if ((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 624) {\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743424))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743424))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743456))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743456))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743488))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743488))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743520))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743520))]);\n }\n if ((((((int)blockIdx.x) / 121) * 64) + (((int)threadIdx.x) >> 5)) < 616) {\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + 
((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867328))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867328))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867360))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867360))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867392))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867392))]);\n compute[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867424))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 121) * 991232) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 867424))]);\n }\n}\n", "gridDim": [1331, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,42,83,83]_[42,42,1,1]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,42,83,83]_[42,42,1,1]_[128,42,83,83].json new file mode 100644 index 000000000..acacba3f3 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,42,83,83]_[42,42,1,1]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 83, 83], "filter_shape": [42, 42, 1, 1], "output_shape": [128, 42, 
83, 83], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_42_83_83___42_42_1_1___128_42_83_83_", "code": "extern \"C\" __global__ void roller_Convolution__128_42_83_83___42_42_1_1___128_42_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[512];\n float compute_shared_local[2];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 6; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = ((((((int)blockIdx.x) * 256) + 
(((int)threadIdx.x) & 255)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 6889) * 289338) + (k_outer * 55112)) + ((((int)threadIdx.x) >> 8) * 6889)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 6889)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 512))] = (((((k_outer * 8) + (((int)threadIdx.x) >> 8)) < 40) && (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) < 881792)) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 6889) * 289338) + (k_outer * 55112)) + ((((int)threadIdx.x) >> 8) * 6889)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 6889)) + 13778))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1024))] = (((((k_outer * 8) + (((int)threadIdx.x) >> 8)) < 38) && (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) < 881792)) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 6889) * 289338) + (k_outer * 55112)) + ((((int)threadIdx.x) >> 8) * 6889)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 6889)) + 27556))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1536))] = (((((k_outer * 8) + (((int)threadIdx.x) >> 8)) < 36) && (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) < 881792)) ? data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 6889) * 289338) + (k_outer * 55112)) + ((((int)threadIdx.x) >> 8) * 6889)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 6889)) + 41334))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = (((((int)threadIdx.x) < 336) && (((k_outer * 8) + (((int)threadIdx.x) & 7)) < 42)) ? 
kernel[(((((((int)threadIdx.x) >> 3) * 42) + (k_outer * 8)) + (((int)threadIdx.x) & 7)))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 8; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 127)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 127)) + 128))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 32))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 64))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 96))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 128))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 160))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 192))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 224))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 256))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 288))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 320))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 352))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 384))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 416))];\n compute_d_shared_local[(14)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 448))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 7) * 8) + k_inner_outer) + 480))];\n if (((k_outer * 8) + k_inner_outer) < 42) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + 
(compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n 
compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 128))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 128))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 3527168))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 3527168))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 3527296))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 3527296))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 7054336))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 7054336))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 7054464))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 7054464))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 10581504))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 10581504))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 10581632))] = (compute_local[(7)] + 
bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 10581632))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 14108672))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 14108672))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 14108800))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 14108800))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 17635840))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 17635840))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 17635968))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 17635968))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 21163008))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 21163008))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 21163136))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 21163136))]);\n }\n 
compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 24690176))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 24690176))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 24690304))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 24690304))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 28217344))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 28217344))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 28217472))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 28217472))]);\n }\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 31744512))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 31744512))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 31744640))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 31744640))]);\n }\n if (((int)threadIdx.x) < 256) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) 
+ 35271680))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 35271680))]);\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 127)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 35271808))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 7) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 127)) + 35271808))]);\n }\n }\n}\n", "gridDim": [3445, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_bias.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_bias.json new file mode 100644 index 000000000..490463ece --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_bias.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 11, 11], "filter_shape": [672, 672, 1, 1], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_672_11_11___672_672_1_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_Convolution__128_672_11_11___672_672_1_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 
0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 21; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 363))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 726))];\n compute_shared[((((int)threadIdx.x) + 1152))] = 
data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1089))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1452))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1815))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2178))];\n compute_shared[((((int)threadIdx.x) + 2688))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2541))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2904))];\n compute_shared[((((int)threadIdx.x) + 3456))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3267))];\n if (((int)threadIdx.x) < 256) {\n 
compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3630))];\n }\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n compute_d_shared[((((int)threadIdx.x) + 1920))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40320))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 2688))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 56448))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + 
(((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n 
compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185856))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185856))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371712))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371712))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557568))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + 
((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557568))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743424))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743424))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929280))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929280))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115136))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115136))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1300992))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1300992))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) 
>> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185888))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185888))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371744))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371744))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557600))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557600))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743456))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743456))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929312))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929312))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115168))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) 
>> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115168))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301024))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301024))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185920))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185920))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371776))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371776))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557632))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557632))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + 
((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743488))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743488))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929344))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929344))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115200))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115200))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301056))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301056))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185952))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + 
((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185952))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371808))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371808))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557664))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557664))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743520))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743520))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929376))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929376))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115232))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115232))]);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + 
((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301088))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301088))]);\n}\n", "gridDim": [847, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_relu.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_relu.json new file mode 100644 index 000000000..afa015e59 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,672,11,11]_[672,672,1,1]_[128,672,11,11]_relu.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 11, 11], "filter_shape": [672, 672, 1, 1], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_672_11_11___672_672_1_1___128_672_11_11__relu", "code": "extern \"C\" __global__ void roller_Convolution__128_672_11_11___672_672_1_1___128_672_11_11__relu(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 
0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 21; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 363))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 726))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1089))];\n 
compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1452))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 1815))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2178))];\n compute_shared[((((int)threadIdx.x) + 2688))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2541))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 2904))];\n compute_shared[((((int)threadIdx.x) + 3456))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) % 121)) + 3267))];\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 121) * 128) + (((int)threadIdx.x) & 127)) / 121) * 81312) + (k_outer * 3872)) + ((((int)threadIdx.x) >> 7) * 121)) + ((((((int)blockIdx.x) % 121) * 128) + 
(((int)threadIdx.x) & 127)) % 121)) + 3630))];\n }\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8064))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24192))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n compute_d_shared[((((int)threadIdx.x) + 1920))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40320))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n compute_d_shared[((((int)threadIdx.x) + 2688))] = kernel[(((((((((int)blockIdx.x) / 121) * 64512) + ((((int)threadIdx.x) >> 5) * 672)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 56448))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 
64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = 
(compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n 
compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185856))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185856))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371712))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371712))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557568))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557568))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + 
((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743424))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929280))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929280))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115136))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115136))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1300992))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1300992))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + 
(((int)threadIdx.x) & 31)) + 185888))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185888))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371744))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371744))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557600))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557600))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743456))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929312))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929312))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115168))] = max((compute_local[(25)] + 
bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115168))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301024))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301024))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185920))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185920))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371776))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371776))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557632))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + 
((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557632))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743488))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743488))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929344))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929344))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115200))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115200))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301056))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 96))]), 
0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185952))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 185952))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371808))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 371808))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557664))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 557664))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743520))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 743520))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929376))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 929376))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + 
((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115232))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1115232))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301088))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 121) * 1486848) + ((((int)threadIdx.x) >> 5) * 15488)) + ((((int)blockIdx.x) % 121) * 128)) + (((int)threadIdx.x) & 31)) + 1301088))]), 0.000000e+00f);\n}\n", "gridDim": [847, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_bias.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_bias.json new file mode 100644 index 000000000..a26c21e47 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_bias.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 42, 42], "filter_shape": [84, 84, 1, 1], "output_shape": [128, 84, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_84_42_42___84_84_1_1___128_84_42_42_", "code": "extern \"C\" __global__ void roller_Convolution__128_84_42_42___84_84_1_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n 
compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 3; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + 
(((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))];\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 66) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 63) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 60) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 57) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 54) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[(((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1008))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3024))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5040))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n if (((k_outer * 32) + k_inner_outer) < 84) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 31)) + 5419040))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 
128)) + (((int)threadIdx.x) & 31)) + 10838016))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) 
* 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = (compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = (compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = (compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = (compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))]);\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_relu.json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_relu.json new file mode 100644 index 000000000..bca8347f7 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,84,42,42]_[84,84,1,1]_[128,84,42,42]_relu.json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 42, 42], "filter_shape": [84, 84, 1, 1], "output_shape": [128, 84, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_84_42_42___84_84_1_1___128_84_42_42__relu", "code": "extern \"C\" __global__ void roller_Convolution__128_84_42_42___84_84_1_1___128_84_42_42__relu(float* __restrict__ data, float* 
__restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[3072];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 3; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + 
((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 10584))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 15876))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 21168))];\n compute_shared[((((int)threadIdx.x) + 1920))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 26460))];\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 66) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 31752))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2688))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 63) ? 
data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 37044))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 60) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 42336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3456))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 57) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 47628))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 256) {\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) < 54) ? data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 1764) * 148176) + (k_outer * 56448)) + ((((int)threadIdx.x) >> 7) * 1764)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 1764)) + 52920))] : 0.000000e+00f);\n }\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[(((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 384))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1008))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2016))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3024))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4032))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1920))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5040))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2304))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 84) ? kernel[((((((((int)threadIdx.x) >> 5) * 84) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6048))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 2688))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(3)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1920))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2688))];\n if (((k_outer * 32) + k_inner_outer) < 84) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = 
(compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(7)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709504))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709536))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709568))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 2709600))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419008))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419040))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419072))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 5419104))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128512))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128544))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 8128608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838016))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838048))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838080))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 10838112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547520))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547552))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 13547616))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257024))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257088))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16257120))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 225792) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) 
+ 16257120))]), 0.000000e+00f);\n}\n", "gridDim": [1764, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,165,165]_[42,96,1,1]_[128,42,165,165].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,165,165]_[42,96,1,1]_[128,42,165,165].json new file mode 100644 index 000000000..e8ffac6d5 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,165,165]_[42,96,1,1]_[128,42,165,165].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 96, 165, 165], "filter_shape": [42, 96, 1, 1], "output_shape": [128, 42, 165, 165], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_96_165_165___42_96_1_1___128_42_165_165_", "code": "extern \"C\" __global__ void roller_Convolution__128_96_165_165___42_96_1_1___128_42_165_165_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[6144];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 
0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 3; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)))];\n compute_shared[((((int)threadIdx.x) + 384))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 54450))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 108900))];\n compute_shared[((((int)threadIdx.x) + 1152))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 163350))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 217800))];\n 
compute_shared[((((int)threadIdx.x) + 1920))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 272250))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 326700))];\n compute_shared[((((int)threadIdx.x) + 2688))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 381150))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 435600))];\n compute_shared[((((int)threadIdx.x) + 3456))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 490050))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 544500))];\n compute_shared[((((int)threadIdx.x) + 4224))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 598950))];\n 
compute_shared[((((int)threadIdx.x) + 4608))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 653400))];\n compute_shared[((((int)threadIdx.x) + 4992))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 707850))];\n compute_shared[((((int)threadIdx.x) + 5376))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 762300))];\n compute_shared[((((int)threadIdx.x) + 5760))] = data[(((((((((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) / 27225) * 2613600) + (k_outer * 871200)) + ((((int)threadIdx.x) / 192) * 27225)) + (((((int)blockIdx.x) * 192) + (((int)threadIdx.x) % 192)) % 27225)) + 816750))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 384))] = kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1152))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2304))];\n compute_d_shared[((((int)threadIdx.x) + 1152))] = ((((int)threadIdx.x) < 192) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3456))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = 0.000000e+00f;\n if (((int)threadIdx.x) < 128) {\n compute_d_shared[((((int)threadIdx.x) + 1920))] = 0.000000e+00f;\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 192) + (((int)threadIdx.x) % 48)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 192) + (((int)threadIdx.x) % 48)) + 48))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 192) + (((int)threadIdx.x) % 48)) + 96))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 192) + (((int)threadIdx.x) % 48)) + 144))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) / 48) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) / 48) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)))] = (compute_local[(0)] + bias[(((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 48))] = (compute_local[(1)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 48))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 96))] = (compute_local[(2)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 96))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 
48)) + 144))] = (compute_local[(3)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 144))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878400))] = (compute_local[(4)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878400))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878448))] = (compute_local[(5)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878448))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878496))] = (compute_local[(6)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878496))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878544))] = (compute_local[(7)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 27878544))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756800))] = (compute_local[(8)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756800))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756848))] = (compute_local[(9)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756848))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756896))] = (compute_local[(10)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + 
(((int)threadIdx.x) % 48)) + 55756896))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756944))] = (compute_local[(11)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 55756944))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635200))] = (compute_local[(12)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635200))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635248))] = (compute_local[(13)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635248))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635296))] = (compute_local[(14)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635296))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635344))] = (compute_local[(15)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 83635344))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513600))] = (compute_local[(16)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513600))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513648))] = (compute_local[(17)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513648))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + 
(((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513696))] = (compute_local[(18)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513696))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513744))] = (compute_local[(19)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 111513744))]);\n if (((int)threadIdx.x) < 96) {\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392000))] = (compute_local[(20)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392000))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392048))] = (compute_local[(21)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392048))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392096))] = (compute_local[(22)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392096))]);\n compute[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392144))] = (compute_local[(23)] + bias[((((((((int)threadIdx.x) / 48) * 3484800) + (((int)blockIdx.x) * 192)) + (((int)threadIdx.x) % 48)) + 139392144))]);\n }\n}\n", "gridDim": [18150, 1, 1], "blockDim": [384, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,83,83]_[42,96,1,1]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,83,83]_[42,96,1,1]_[128,42,83,83].json new file mode 100644 index 000000000..5dc83284d --- /dev/null +++ 
b/src/tools/nnfusion/kernel_db/roller_nas/roller_Convolution_[128,96,83,83]_[42,96,1,1]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 96, 83, 83], "filter_shape": [42, 96, 1, 1], "output_shape": [128, 42, 83, 83], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Convolution", "tvm_func_name": "roller_Convolution__128_96_83_83___42_96_1_1___128_42_83_83_", "code": "extern \"C\" __global__ void roller_Convolution__128_96_83_83___42_96_1_1___128_42_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float compute_local[64];\n __shared__ float compute_shared[8192];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 
0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 3; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[(((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 256))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? 
data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 6889))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 512))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 13778))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 768))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 20667))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1024))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 27556))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1280))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 34445))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1536))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 41334))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 1792))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? 
data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 48223))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2048))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 55112))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2304))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 62001))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2560))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 68890))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 2816))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 75779))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3072))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 82668))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3328))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? 
data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 89557))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3584))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 96446))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 3840))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 103335))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4096))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 110224))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4352))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 117113))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4608))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 124002))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 4864))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? 
data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 130891))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5120))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 137780))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5376))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 144669))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5632))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 151558))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5888))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 158447))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6144))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 165336))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6400))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? 
data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 172225))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6656))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 179114))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6912))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 186003))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7168))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 192892))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7424))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 199781))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7680))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 206670))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7936))] = ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) < 881792) ? 
data[((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 6889) * 661344) + (k_outer * 220448)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 6889)) + 213559))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 768))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1536))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2304))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3072))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = ((((int)threadIdx.x) < 64) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 96) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3840))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = 0.000000e+00f;\n compute_d_shared[((((int)threadIdx.x) + 1792))] = 0.000000e+00f;\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 63)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 63)) + 64))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 63)) + 128))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 63)) + 192))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 128))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 384))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 640))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 896))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1152))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1280))];\n 
compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1408))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1664))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1792))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 6) * 32) + k_inner_outer) + 1920))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = 
(compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = 
(compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)))] = compute_local[(0)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 64))] = compute_local[(1)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 128))] = compute_local[(2)];\n }\n if 
(((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 192))] = compute_local[(3)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 3527168))] = compute_local[(4)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 3527232))] = compute_local[(5)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 3527296))] = compute_local[(6)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 3527360))] = compute_local[(7)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 7054336))] = compute_local[(8)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 7054400))] = compute_local[(9)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 7054464))] = compute_local[(10)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 7054528))] = compute_local[(11)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 10581504))] = compute_local[(12)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 10581568))] = compute_local[(13)];\n if (((((int)blockIdx.x) * 
256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 10581632))] = compute_local[(14)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 10581696))] = compute_local[(15)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 14108672))] = compute_local[(16)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 14108736))] = compute_local[(17)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 14108800))] = compute_local[(18)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 14108864))] = compute_local[(19)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 17635840))] = compute_local[(20)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 17635904))] = compute_local[(21)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 17635968))] = compute_local[(22)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 17636032))] = compute_local[(23)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + 
(((int)threadIdx.x) & 63)) + 21163008))] = compute_local[(24)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 21163072))] = compute_local[(25)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 21163136))] = compute_local[(26)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 21163200))] = compute_local[(27)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 24690176))] = compute_local[(28)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 24690240))] = compute_local[(29)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 24690304))] = compute_local[(30)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 24690368))] = compute_local[(31)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 28217344))] = compute_local[(32)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 28217408))] = compute_local[(33)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 28217472))] = compute_local[(34)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n 
compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 28217536))] = compute_local[(35)];\n }\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 31744512))] = compute_local[(36)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 31744576))] = compute_local[(37)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 31744640))] = compute_local[(38)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 31744704))] = compute_local[(39)];\n }\n if (((int)threadIdx.x) < 128) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 35271680))] = compute_local[(40)];\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 35271744))] = compute_local[(41)];\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881664) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 35271808))] = compute_local[(42)];\n }\n if (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 63)) < 881600) {\n compute[((((((((int)threadIdx.x) >> 6) * 881792) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 63)) + 35271872))] = compute_local[(43)];\n }\n }\n}\n", "gridDim": [3445, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[3,3,168,1]_[128,168,42,42].json 
b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[3,3,168,1]_[128,168,42,42].json new file mode 100644 index 000000000..658f77a28 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[3,3,168,1]_[128,168,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "filter_shape": [168, 1, 3, 3], "output_shape": [128, 168, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_168_42_42___3_3_168_1___128_168_42_42_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_168_42_42___3_3_168_1___128_168_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[704];\n __shared__ float compute_shared[9];\n float PaddedInput_shared_local[2];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 44))) && (1 <= (((int)threadIdx.x) % 44))) && ((((int)threadIdx.x) % 44) < 43)) ? data[(((((((((int)blockIdx.x) / 1176) * 592704) + ((((int)blockIdx.x) % 1176) * 252)) + ((((int)threadIdx.x) / 44) * 42)) + (((int)threadIdx.x) % 44)) - 43))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((1 <= (((((int)blockIdx.x) % 7) * 6) + (((((int)threadIdx.x) + 288) % 352) / 44))) && ((((((int)blockIdx.x) % 7) * 6) + (((((int)threadIdx.x) + 288) % 352) / 44)) < 43)) && (1 <= ((((int)threadIdx.x) + 24) % 44))) && (((((int)threadIdx.x) + 24) % 44) < 43)) ? 
data[((((((((((int)blockIdx.x) / 1176) * 592704) + (((((int)threadIdx.x) + 288) / 352) * 296352)) + ((((int)blockIdx.x) % 1176) * 252)) + ((((((int)threadIdx.x) + 288) % 352) / 44) * 42)) + ((((int)threadIdx.x) + 24) % 44)) - 43))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 128) {\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = (((((((((int)blockIdx.x) % 7) * 6) + ((((int)threadIdx.x) + 224) / 44)) < 43) && (1 <= ((((int)threadIdx.x) + 4) % 44))) && (((((int)threadIdx.x) + 4) % 44) < 43)) ? data[((((((((((int)blockIdx.x) / 1176) * 592704) + (((((int)threadIdx.x) + 576) / 352) * 296352)) + ((((int)blockIdx.x) % 1176) * 252)) + (((((int)threadIdx.x) + 224) / 44) * 42)) + ((((int)threadIdx.x) + 4) % 44)) - 43))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 9) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 1176) / 7) * 9) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 144) * 352) + (((((int)threadIdx.x) % 144) / 24) * 44)) + ((k_inner_outer / 3) * 44)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 3)))];\n if (((((int)threadIdx.x) % 24) + (k_inner_outer % 3)) < 20) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[((((((((((int)threadIdx.x) / 144) * 352) + (((((int)threadIdx.x) % 144) / 24) * 44)) + ((k_inner_outer / 3) * 44)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 3)) + 24))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 24) < 18) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n compute[(((((((((int)blockIdx.x) / 1176) * 592704) + ((((int)threadIdx.x) / 144) * 296352)) + ((((int)blockIdx.x) % 1176) * 252)) + 
(((((int)threadIdx.x) % 144) / 24) * 42)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 24) < 18) {\n compute[((((((((((int)blockIdx.x) / 1176) * 592704) + ((((int)threadIdx.x) / 144) * 296352)) + ((((int)blockIdx.x) % 1176) * 252)) + (((((int)threadIdx.x) % 144) / 24) * 42)) + (((int)threadIdx.x) % 24)) + 24))] = DepthwiseConv2d_local[(1)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[5,5,168,1]_[128,168,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[5,5,168,1]_[128,168,42,42].json new file mode 100644 index 000000000..5e7b0270a --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,168,42,42]_[5,5,168,1]_[128,168,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 168, 42, 42], "filter_shape": [168, 1, 5, 5], "output_shape": [128, 168, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_168_42_42___5_5_168_1___128_168_42_42_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_168_42_42___5_5_168_1___128_168_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[460];\n __shared__ float compute_shared[25];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((2 <= (((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 46))) && (2 <= (((int)threadIdx.x) % 46))) && ((((int)threadIdx.x) % 46) < 44)) ? 
data[(((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 46) * 42)) + (((int)threadIdx.x) % 46)) - 86))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 172) {\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 7) * 6) + ((((int)threadIdx.x) + 288) / 46)) < 44) && (2 <= ((((int)threadIdx.x) + 12) % 46))) && (((((int)threadIdx.x) + 12) % 46) < 44)) ? data[(((((((int)blockIdx.x) * 252) + (((((int)threadIdx.x) + 288) / 46) * 42)) + ((((int)threadIdx.x) + 12) % 46)) - 86))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 25) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 1176) / 7) * 25) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if (((((int)threadIdx.x) % 48) + (k_inner_outer % 5)) < 46) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 48) * 46) + ((k_inner_outer / 5) * 46)) + (((int)threadIdx.x) % 48)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) % 48) < 42) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 48) < 42) {\n compute[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [150528, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[3,3,336,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[3,3,336,1]_[128,336,21,21].json new file mode 100644 index 000000000..b797c1283 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[3,3,336,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 
336, 21, 21], "filter_shape": [336, 1, 3, 3], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_336_21_21___3_3_336_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_336_21_21___3_3_336_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[322];\n __shared__ float compute_shared[9];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23)) < 23) {\n PaddedInput_shared[(((int)threadIdx.x))] = (((((1 <= (((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23))) && ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 23)) < 22)) && (1 <= (((int)threadIdx.x) % 23))) && ((((int)threadIdx.x) % 23) < 22)) ? data[(((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 23) * 21)) + (((int)threadIdx.x) % 23)) - 22))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 34) {\n if ((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 288) / 23)) < 23) {\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) & 1) * 12) + ((((int)threadIdx.x) + 288) / 23)) < 22) && (1 <= ((((int)threadIdx.x) + 12) % 23))) && (((((int)threadIdx.x) + 12) % 23) < 22)) ? 
data[(((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + (((((int)threadIdx.x) + 288) / 23) * 21)) + ((((int)threadIdx.x) + 12) % 23)) - 22))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 9) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 672) >> 1) * 9) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if (((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) + (k_inner_outer / 3)) < 23) {\n if (((((int)threadIdx.x) % 24) + (k_inner_outer % 3)) < 23) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 24) * 23) + ((k_inner_outer / 3) * 23)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 3)))];\n }\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) < 21) {\n if ((((int)threadIdx.x) % 24) < 21) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) & 1) * 12) + (((int)threadIdx.x) / 24)) < 21) {\n if ((((int)threadIdx.x) % 24) < 21) {\n compute[((((((((int)blockIdx.x) >> 1) * 441) + ((((int)blockIdx.x) & 1) * 252)) + ((((int)threadIdx.x) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n }\n }\n}\n", "gridDim": [86016, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[5,5,336,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[5,5,336,1]_[128,336,21,21].json new file mode 100644 index 000000000..0b9ccb1e0 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[5,5,336,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 21, 21], "filter_shape": 
[336, 1, 5, 5], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_336_21_21___5_5_336_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_336_21_21___5_5_336_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[700];\n __shared__ float compute_shared[100];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = (((((2 <= (((((int)blockIdx.x) % 7) * 3) + ((((int)threadIdx.x) % 175) / 25))) && ((((((int)blockIdx.x) % 7) * 3) + ((((int)threadIdx.x) % 175) / 25)) < 23)) && (2 <= (((int)threadIdx.x) % 25))) && ((((int)threadIdx.x) % 25) < 23)) ? data[((((((((((int)blockIdx.x) / 7) * 1764) + ((((int)threadIdx.x) / 175) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) % 175) / 25) * 21)) + (((int)threadIdx.x) % 25)) - 44))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((2 <= (((((int)blockIdx.x) % 7) * 3) + (((((int)threadIdx.x) + 113) % 175) / 25))) && ((((((int)blockIdx.x) % 7) * 3) + (((((int)threadIdx.x) + 113) % 175) / 25)) < 23)) && (2 <= ((((int)threadIdx.x) + 13) % 25))) && (((((int)threadIdx.x) + 13) % 25) < 23)) ? 
data[((((((((((int)blockIdx.x) / 7) * 1764) + (((((int)threadIdx.x) + 288) / 175) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + ((((((int)threadIdx.x) + 113) % 175) / 25) * 21)) + ((((int)threadIdx.x) + 13) % 25)) - 44))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 124) {\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = (((((((((int)blockIdx.x) % 7) * 3) + ((((int)threadIdx.x) + 51) / 25)) < 23) && (2 <= ((((int)threadIdx.x) + 1) % 25))) && (((((int)threadIdx.x) + 1) % 25) < 23)) ? data[((((((((((int)blockIdx.x) / 7) * 1764) + (((((int)threadIdx.x) + 576) / 175) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) + 51) / 25) * 21)) + ((((int)threadIdx.x) + 1) % 25)) - 44))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 100) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 7) * 100) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if (((((int)threadIdx.x) % 24) + (k_inner_outer % 5)) < 25) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 72) * 175) + (((((int)threadIdx.x) % 72) / 24) * 25)) + ((k_inner_outer / 5) * 25)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 72) * 25) + k_inner_outer))];\n if ((((int)threadIdx.x) % 24) < 21) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 24) < 21) {\n compute[(((((((((int)blockIdx.x) / 7) * 1764) + ((((int)threadIdx.x) / 72) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) % 72) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[7,7,336,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[7,7,336,1]_[128,336,21,21].json new file mode 100644 index 000000000..f90f2b27e --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,21,21]_[7,7,336,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 21, 21], "filter_shape": [336, 1, 7, 7], "output_shape": [128, 336, 21, 21], "window_movement_strides": [1, 1], "padding_below_diff": [3, 3], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_336_21_21___7_7_336_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_336_21_21___7_7_336_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[972];\n __shared__ float compute_shared[98];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = (((((3 <= (((((int)blockIdx.x) % 7) * 3) + ((((int)threadIdx.x) % 243) / 27))) && ((((((int)blockIdx.x) % 7) * 3) + ((((int)threadIdx.x) % 243) / 27)) < 24)) && (3 <= (((int)threadIdx.x) % 27))) && ((((int)threadIdx.x) % 27) < 24)) ? 
data[(((((((((((int)blockIdx.x) / 1176) * 296352) + (((((int)blockIdx.x) % 1176) / 7) * 882)) + ((((int)threadIdx.x) / 243) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) % 243) / 27) * 21)) + (((int)threadIdx.x) % 27)) - 66))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((3 <= (((((int)blockIdx.x) % 7) * 3) + (((((int)threadIdx.x) + 45) % 243) / 27))) && ((((((int)blockIdx.x) % 7) * 3) + (((((int)threadIdx.x) + 45) % 243) / 27)) < 24)) && (3 <= ((((int)threadIdx.x) + 18) % 27))) && (((((int)threadIdx.x) + 18) % 27) < 24)) ? data[((((((((((((int)blockIdx.x) / 1176) * 296352) + (((((int)threadIdx.x) + 288) / 486) * 148176)) + (((((int)blockIdx.x) % 1176) / 7) * 882)) + ((((((int)threadIdx.x) + 288) % 486) / 243) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + ((((((int)threadIdx.x) + 45) % 243) / 27) * 21)) + ((((int)threadIdx.x) + 18) % 27)) - 66))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = (((((3 <= (((((int)blockIdx.x) % 7) * 3) + (((((int)threadIdx.x) + 90) % 243) / 27))) && ((((((int)blockIdx.x) % 7) * 3) + (((((int)threadIdx.x) + 90) % 243) / 27)) < 24)) && (3 <= ((((int)threadIdx.x) + 9) % 27))) && (((((int)threadIdx.x) + 9) % 27) < 24)) ? data[((((((((((((int)blockIdx.x) / 1176) * 296352) + (((((int)threadIdx.x) + 576) / 486) * 148176)) + (((((int)blockIdx.x) % 1176) / 7) * 882)) + (((((int)threadIdx.x) + 90) / 243) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + ((((((int)threadIdx.x) + 90) % 243) / 27) * 21)) + ((((int)threadIdx.x) + 9) % 27)) - 66))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 108) {\n PaddedInput_shared[((((int)threadIdx.x) + 864))] = (((((((((int)blockIdx.x) % 7) * 3) + ((((int)threadIdx.x) + 135) / 27)) < 24) && (3 <= (((int)threadIdx.x) % 27))) && ((((int)threadIdx.x) % 27) < 24)) ? 
data[((((((((((((int)blockIdx.x) / 1176) * 296352) + (((((int)threadIdx.x) + 864) / 486) * 148176)) + (((((int)blockIdx.x) % 1176) / 7) * 882)) + (((((int)threadIdx.x) + 378) / 243) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) + 135) / 27) * 21)) + (((int)threadIdx.x) % 27)) - 66))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 98) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 1176) / 7) * 98) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if (((((int)threadIdx.x) % 24) + (k_inner_outer % 7)) < 27) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 72) * 243) + (((((int)threadIdx.x) % 72) / 24) * 27)) + ((k_inner_outer / 7) * 27)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 7)))];\n }\n compute_shared_local[(0)] = compute_shared[(((((((int)threadIdx.x) % 144) / 72) * 49) + k_inner_outer))];\n if ((((int)threadIdx.x) % 24) < 21) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 24) < 21) {\n compute[(((((((((((int)blockIdx.x) / 1176) * 296352) + ((((int)threadIdx.x) / 144) * 148176)) + (((((int)blockIdx.x) % 1176) / 7) * 882)) + (((((int)threadIdx.x) % 144) / 72) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) % 72) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,45,45]_[5,5,336,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,45,45]_[5,5,336,1]_[128,336,21,21].json new file mode 100644 index 000000000..6664a0362 --- /dev/null +++ 
b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,45,45]_[5,5,336,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 45, 45], "filter_shape": [336, 1, 5, 5], "output_shape": [128, 336, 21, 21], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_336_45_45___5_5_336_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_336_45_45___5_5_336_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[1620];\n __shared__ float compute_shared[100];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = data[(((((((int)blockIdx.x) / 7) * 8100) + ((((int)blockIdx.x) % 7) * 270)) + ((int)threadIdx.x)))];\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = data[((((((((int)blockIdx.x) / 7) * 8100) + (((((int)threadIdx.x) + 288) / 405) * 2025)) + ((((int)blockIdx.x) % 7) * 270)) + ((((int)threadIdx.x) + 288) % 405)))];\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = data[((((((((int)blockIdx.x) / 7) * 8100) + (((((int)threadIdx.x) + 576) / 405) * 2025)) + ((((int)blockIdx.x) % 7) * 270)) + ((((int)threadIdx.x) + 171) % 405)))];\n PaddedInput_shared[((((int)threadIdx.x) + 864))] = data[((((((((int)blockIdx.x) / 7) * 8100) + (((((int)threadIdx.x) + 864) / 405) * 2025)) + ((((int)blockIdx.x) % 7) * 270)) + (((int)threadIdx.x) + 54)))];\n PaddedInput_shared[((((int)threadIdx.x) + 1152))] = data[((((((((int)blockIdx.x) / 7) * 8100) + (((((int)threadIdx.x) + 1152) / 405) * 2025)) + ((((int)blockIdx.x) % 7) * 270)) + ((((int)threadIdx.x) + 342) % 405)))];\n if (((int)threadIdx.x) < 180) {\n 
PaddedInput_shared[((((int)threadIdx.x) + 1440))] = data[((((((((int)blockIdx.x) / 7) * 8100) + (((((int)threadIdx.x) + 1440) / 405) * 2025)) + ((((int)blockIdx.x) % 7) * 270)) + (((int)threadIdx.x) + 225)))];\n }\n if (((int)threadIdx.x) < 100) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 7) * 100) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if ((((((int)threadIdx.x) % 24) * 2) + (k_inner_outer % 5)) < 45) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 72) * 405) + (((((int)threadIdx.x) % 72) / 24) * 90)) + ((k_inner_outer / 5) * 45)) + ((((int)threadIdx.x) % 24) * 2)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 72) * 25) + k_inner_outer))];\n if ((((int)threadIdx.x) % 24) < 21) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 24) < 21) {\n compute[(((((((((int)blockIdx.x) / 7) * 1764) + ((((int)threadIdx.x) / 72) * 441)) + ((((int)blockIdx.x) % 7) * 63)) + (((((int)threadIdx.x) % 72) / 24) * 21)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,47,47]_[7,7,336,1]_[128,336,21,21].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,47,47]_[7,7,336,1]_[128,336,21,21].json new file mode 100644 index 000000000..3adde3e5e --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,336,47,47]_[7,7,336,1]_[128,336,21,21].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 336, 47, 47], "filter_shape": [336, 1, 7, 7], "output_shape": [128, 336, 21, 21], "window_movement_strides": [2, 2], 
"padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_336_47_47___7_7_336_1___128_336_21_21_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_336_47_47___7_7_336_1___128_336_21_21_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[1316];\n __shared__ float compute_shared[196];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = data[(((((((int)blockIdx.x) / 21) * 8836) + ((((int)blockIdx.x) % 21) * 94)) + ((int)threadIdx.x)))];\n PaddedInput_shared[((((int)threadIdx.x) + 96))] = data[((((((((int)blockIdx.x) / 21) * 8836) + ((((int)blockIdx.x) % 21) * 94)) + ((int)threadIdx.x)) + 96))];\n PaddedInput_shared[((((int)threadIdx.x) + 192))] = data[((((((((int)blockIdx.x) / 21) * 8836) + ((((int)blockIdx.x) % 21) * 94)) + ((int)threadIdx.x)) + 192))];\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 288) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + ((((int)threadIdx.x) + 288) % 329)))];\n PaddedInput_shared[((((int)threadIdx.x) + 384))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 384) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 55)))];\n PaddedInput_shared[((((int)threadIdx.x) + 480))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 480) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 151)))];\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 576) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + ((((int)threadIdx.x) + 247) % 329)))];\n PaddedInput_shared[((((int)threadIdx.x) + 
672))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 672) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 14)))];\n PaddedInput_shared[((((int)threadIdx.x) + 768))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 768) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 110)))];\n PaddedInput_shared[((((int)threadIdx.x) + 864))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 864) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 206)))];\n PaddedInput_shared[((((int)threadIdx.x) + 960))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 960) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + ((((int)threadIdx.x) + 302) % 329)))];\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 1056) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 69)))];\n PaddedInput_shared[((((int)threadIdx.x) + 1152))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 1152) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 165)))];\n if (((int)threadIdx.x) < 68) {\n PaddedInput_shared[((((int)threadIdx.x) + 1248))] = data[((((((((int)blockIdx.x) / 21) * 8836) + (((((int)threadIdx.x) + 1248) / 329) * 2209)) + ((((int)blockIdx.x) % 21) * 94)) + (((int)threadIdx.x) + 261)))];\n }\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 1764) / 21) * 196) + ((int)threadIdx.x)))];\n compute_shared[((((int)threadIdx.x) + 96))] = kernel[((((((((int)blockIdx.x) % 1764) / 21) * 196) + ((int)threadIdx.x)) + 96))];\n if (((int)threadIdx.x) < 4) {\n compute_shared[((((int)threadIdx.x) + 192))] = kernel[((((((((int)blockIdx.x) % 1764) / 21) * 196) + ((int)threadIdx.x)) + 192))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if 
((((((int)threadIdx.x) % 24) * 2) + (k_inner_outer % 7)) < 47) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 24) * 329) + ((k_inner_outer / 7) * 47)) + ((((int)threadIdx.x) % 24) * 2)) + (k_inner_outer % 7)))];\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 24) * 49) + k_inner_outer))];\n if ((((int)threadIdx.x) % 24) < 21) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 24) < 21) {\n compute[((((((((int)blockIdx.x) / 21) * 1764) + ((((int)threadIdx.x) / 24) * 441)) + ((((int)blockIdx.x) % 21) * 21)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [225792, 1, 1], "blockDim": [96, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,165,165]_[5,5,42,1]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,165,165]_[5,5,42,1]_[128,42,83,83].json new file mode 100644 index 000000000..a2977b62e --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,165,165]_[5,5,42,1]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 165, 165], "filter_shape": [42, 1, 5, 5], "output_shape": [128, 42, 83, 83], "window_movement_strides": [2, 2], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_42_165_165___5_5_42_1___128_42_83_83_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_42_165_165___5_5_42_1___128_42_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[3211];\n __shared__ float compute_shared[25];\n float PaddedInput_shared_local[2];\n float 
compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((2 <= (((((int)blockIdx.x) % 11) * 16) + (((int)threadIdx.x) / 169))) && (2 <= (((int)threadIdx.x) % 169))) && ((((int)threadIdx.x) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + ((((int)threadIdx.x) / 169) * 165)) + (((int)threadIdx.x) % 169)) - 332))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = (((2 <= ((((int)threadIdx.x) + 14) % 169)) && (((((int)threadIdx.x) + 14) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 352) / 169) * 165)) + ((((int)threadIdx.x) + 14) % 169)) - 332))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((2 <= ((((int)threadIdx.x) + 28) % 169)) && (((((int)threadIdx.x) + 28) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 704) / 169) * 165)) + ((((int)threadIdx.x) + 28) % 169)) - 332))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1056) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 42) % 169))) && (((((int)threadIdx.x) + 42) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1056) / 169) * 165)) + ((((int)threadIdx.x) + 42) % 169)) - 332))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1408) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 1408))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1408) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 56) % 169))) && (((((int)threadIdx.x) + 56) % 169) < 167)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1408) / 169) * 165)) + ((((int)threadIdx.x) + 56) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1760) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 1760))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1760) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 70) % 169))) && (((((int)threadIdx.x) + 70) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1760) / 169) * 165)) + ((((int)threadIdx.x) + 70) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2112) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 2112))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2112) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 84) % 169))) && (((((int)threadIdx.x) + 84) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2112) / 169) * 165)) + ((((int)threadIdx.x) + 84) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2464) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 2464))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2464) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 98) % 169))) && (((((int)threadIdx.x) + 98) % 169) < 167)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2464) / 169) * 165)) + ((((int)threadIdx.x) + 98) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2816) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 2816))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2816) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 112) % 169))) && (((((int)threadIdx.x) + 112) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2816) / 169) * 165)) + ((((int)threadIdx.x) + 112) % 169)) - 332))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 43) {\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3168) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 3168))] = ((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3168) / 169)) < 167) && (((int)threadIdx.x) < 41)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 3168) / 169) * 165)) + (((int)threadIdx.x) + 126)) - 332))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 25) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 462) / 11) * 25) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) / 44) * 2)) + (k_inner_outer / 5)) < 169) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 44) * 338) + ((k_inner_outer / 5) * 169)) + ((((int)threadIdx.x) % 44) * 2)) + (k_inner_outer % 5)))];\n if ((((((int)threadIdx.x) % 44) * 2) + (k_inner_outer % 5)) < 81) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 44) * 338) + ((k_inner_outer / 5) * 169)) + ((((int)threadIdx.x) % 44) * 2)) + (k_inner_outer % 5)) + 88))];\n }\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 44) < 39) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n compute[((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 44) < 39) {\n compute[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)) + 44))] = DepthwiseConv2d_local[(1)];\n }\n }\n}\n", "gridDim": [59136, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end 
of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[3,3,42,1]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[3,3,42,1]_[128,42,83,83].json new file mode 100644 index 000000000..e094a6334 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[3,3,42,1]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 83, 83], "filter_shape": [42, 1, 3, 3], "output_shape": [128, 42, 83, 83], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_42_83_83___3_3_42_1___128_42_83_83_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_42_83_83___3_3_42_1___128_42_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[680];\n __shared__ float compute_shared[9];\n float PaddedInput_shared_local[2];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 14) * 6) + (((int)threadIdx.x) / 85))) && (1 <= (((int)threadIdx.x) % 85))) && ((((int)threadIdx.x) % 85) < 84)) ? data[(((((((((int)blockIdx.x) / 14) * 6889) + ((((int)blockIdx.x) % 14) * 498)) + ((((int)threadIdx.x) / 85) * 83)) + (((int)threadIdx.x) % 85)) - 84))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 14) * 6) + ((((int)threadIdx.x) + 288) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 33) % 85))) && (((((int)threadIdx.x) + 33) % 85) < 84)) ? 
data[(((((((((int)blockIdx.x) / 14) * 6889) + ((((int)blockIdx.x) % 14) * 498)) + (((((int)threadIdx.x) + 288) / 85) * 83)) + ((((int)threadIdx.x) + 33) % 85)) - 84))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 104) {\n if ((((((int)blockIdx.x) % 14) * 6) + ((((int)threadIdx.x) + 576) / 85)) < 85) {\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = (((((((((int)blockIdx.x) % 14) * 6) + ((((int)threadIdx.x) + 576) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 66) % 85))) && (((((int)threadIdx.x) + 66) % 85) < 84)) ? data[(((((((((int)blockIdx.x) / 14) * 6889) + ((((int)blockIdx.x) % 14) * 498)) + (((((int)threadIdx.x) + 576) / 85) * 83)) + ((((int)threadIdx.x) + 66) % 85)) - 84))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 9) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 14) * 9) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 14) * 6) + (((int)threadIdx.x) / 48)) + (k_inner_outer / 3)) < 85) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 48) * 85) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 48)) + (k_inner_outer % 3)))];\n if (((((int)threadIdx.x) % 48) + (k_inner_outer % 3)) < 37) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 48) * 85) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 48)) + (k_inner_outer % 3)) + 48))];\n }\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) % 14) * 6) + (((int)threadIdx.x) / 48)) < 83) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 48) < 35) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 14) * 6) + (((int)threadIdx.x) / 48)) < 83) {\n 
compute[((((((((int)blockIdx.x) / 14) * 6889) + ((((int)blockIdx.x) % 14) * 498)) + ((((int)threadIdx.x) / 48) * 83)) + (((int)threadIdx.x) % 48)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 48) < 35) {\n compute[(((((((((int)blockIdx.x) / 14) * 6889) + ((((int)blockIdx.x) % 14) * 498)) + ((((int)threadIdx.x) / 48) * 83)) + (((int)threadIdx.x) % 48)) + 48))] = DepthwiseConv2d_local[(1)];\n }\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[5,5,42,1]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[5,5,42,1]_[128,42,83,83].json new file mode 100644 index 000000000..0e4c13c45 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[5,5,42,1]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 83, 83], "filter_shape": [42, 1, 5, 5], "output_shape": [128, 42, 83, 83], "window_movement_strides": [1, 1], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_42_83_83___5_5_42_1___128_42_83_83_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_42_83_83___5_5_42_1___128_42_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[4];\n __shared__ float PaddedInput_shared[1700];\n __shared__ float compute_shared[18];\n float PaddedInput_shared_local[4];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n DepthwiseConv2d_local[(2)] = 0.000000e+00f;\n DepthwiseConv2d_local[(3)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = (((((1 <= (((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 85))) && ((((((int)blockIdx.x) % 11) 
* 8) + (((int)threadIdx.x) / 85)) < 84)) && (1 <= (((int)threadIdx.x) % 85))) && ((((int)threadIdx.x) % 85) < 84)) ? data[(((((((((int)blockIdx.x) / 11) * 13778) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 85) * 83)) + (((int)threadIdx.x) % 85)) - 84))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 352) / 85)) < 85) {\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = (((((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 352) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 12) % 85))) && (((((int)threadIdx.x) + 12) % 85) < 84)) ? data[(((((((((int)blockIdx.x) / 11) * 13778) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) + 352) / 85) * 83)) + ((((int)threadIdx.x) + 12) % 85)) - 84))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 8) + (((((int)threadIdx.x) + 704) % 850) / 85)) < 85) {\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((((1 <= (((((int)blockIdx.x) % 11) * 8) + (((((int)threadIdx.x) + 704) % 850) / 85))) && ((((((int)blockIdx.x) % 11) * 8) + (((((int)threadIdx.x) + 704) % 850) / 85)) < 84)) && (1 <= ((((int)threadIdx.x) + 24) % 85))) && (((((int)threadIdx.x) + 24) % 85) < 84)) ? data[((((((((((int)blockIdx.x) / 11) * 13778) + (((((int)threadIdx.x) + 704) / 850) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + ((((((int)threadIdx.x) + 704) % 850) / 85) * 83)) + ((((int)threadIdx.x) + 24) % 85)) - 84))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 206) / 85)) < 85) {\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = (((((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 206) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 36) % 85))) && (((((int)threadIdx.x) + 36) % 85) < 84)) ? 
data[((((((((((int)blockIdx.x) / 11) * 13778) + (((((int)threadIdx.x) + 1056) / 850) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) + 206) / 85) * 83)) + ((((int)threadIdx.x) + 36) % 85)) - 84))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 292) {\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 558) / 85)) < 85) {\n PaddedInput_shared[((((int)threadIdx.x) + 1408))] = (((((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 558) / 85)) < 84) && (1 <= ((((int)threadIdx.x) + 48) % 85))) && (((((int)threadIdx.x) + 48) % 85) < 84)) ? data[((((((((((int)blockIdx.x) / 11) * 13778) + (((((int)threadIdx.x) + 1408) / 850) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) + 558) / 85) * 83)) + ((((int)threadIdx.x) + 48) % 85)) - 84))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 18) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 231) / 11) * 18) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) % 176) / 22)) + (k_inner_outer / 3)) < 85) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 176) * 850) + (((((int)threadIdx.x) % 176) / 22) * 85)) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 22)) + (k_inner_outer % 3)))];\n PaddedInput_shared_local[(1)] = PaddedInput_shared[((((((((((int)threadIdx.x) / 176) * 850) + (((((int)threadIdx.x) % 176) / 22) * 85)) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 22)) + (k_inner_outer % 3)) + 22))];\n PaddedInput_shared_local[(2)] = PaddedInput_shared[((((((((((int)threadIdx.x) / 176) * 850) + (((((int)threadIdx.x) % 176) / 22) * 85)) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 22)) + (k_inner_outer % 3)) + 44))];\n if (((((int)threadIdx.x) % 22) + (k_inner_outer % 3)) < 19) {\n PaddedInput_shared_local[(3)] = PaddedInput_shared[((((((((((int)threadIdx.x) / 
176) * 850) + (((((int)threadIdx.x) % 176) / 22) * 85)) + ((k_inner_outer / 3) * 85)) + (((int)threadIdx.x) % 22)) + (k_inner_outer % 3)) + 66))];\n }\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 176) * 9) + k_inner_outer))];\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) % 176) / 22)) < 83) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n DepthwiseConv2d_local[(2)] = (DepthwiseConv2d_local[(2)] + (PaddedInput_shared_local[(2)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 22) < 17) {\n DepthwiseConv2d_local[(3)] = (DepthwiseConv2d_local[(3)] + (PaddedInput_shared_local[(3)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) % 176) / 22)) < 83) {\n compute[(((((((((int)blockIdx.x) / 11) * 13778) + ((((int)threadIdx.x) / 176) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) % 176) / 22) * 83)) + (((int)threadIdx.x) % 22)))] = DepthwiseConv2d_local[(0)];\n compute[((((((((((int)blockIdx.x) / 11) * 13778) + ((((int)threadIdx.x) / 176) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) % 176) / 22) * 83)) + (((int)threadIdx.x) % 22)) + 22))] = DepthwiseConv2d_local[(1)];\n compute[((((((((((int)blockIdx.x) / 11) * 13778) + ((((int)threadIdx.x) / 176) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) % 176) / 22) * 83)) + (((int)threadIdx.x) % 22)) + 44))] = DepthwiseConv2d_local[(2)];\n if ((((int)threadIdx.x) % 22) < 17) {\n compute[((((((((((int)blockIdx.x) / 11) * 13778) + ((((int)threadIdx.x) / 176) * 6889)) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) % 176) / 22) * 83)) + (((int)threadIdx.x) % 22)) + 66))] = DepthwiseConv2d_local[(3)];\n }\n }\n}\n", "gridDim": [29568, 1, 1], "blockDim": 
[352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[7,7,42,1]_[128,42,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[7,7,42,1]_[128,42,83,83].json new file mode 100644 index 000000000..759a25093 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,42,83,83]_[7,7,42,1]_[128,42,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 42, 83, 83], "filter_shape": [42, 1, 7, 7], "output_shape": [128, 42, 83, 83], "window_movement_strides": [1, 1], "padding_below_diff": [3, 3], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_42_83_83___7_7_42_1___128_42_83_83_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_42_83_83___7_7_42_1___128_42_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[1246];\n __shared__ float compute_shared[49];\n float PaddedInput_shared_local[2];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((3 <= (((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 89))) && (3 <= (((int)threadIdx.x) % 89))) && ((((int)threadIdx.x) % 89) < 86)) ? data[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 89) * 83)) + (((int)threadIdx.x) % 89)) - 252))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = (((((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 352) / 89)) < 86) && (3 <= ((((int)threadIdx.x) + 85) % 89))) && (((((int)threadIdx.x) + 85) % 89) < 86)) ? 
data[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) + 352) / 89) * 83)) + ((((int)threadIdx.x) + 85) % 89)) - 252))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 704) / 89)) < 89) {\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 704) / 89)) < 86) && (3 <= ((((int)threadIdx.x) + 81) % 89))) && (((((int)threadIdx.x) + 81) % 89) < 86)) ? data[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) + 704) / 89) * 83)) + ((((int)threadIdx.x) + 81) % 89)) - 252))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 190) {\n if ((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 1056) / 89)) < 89) {\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = (((((((((int)blockIdx.x) % 11) * 8) + ((((int)threadIdx.x) + 1056) / 89)) < 86) && (3 <= ((((int)threadIdx.x) + 77) % 89))) && (((((int)threadIdx.x) + 77) % 89) < 86)) ? 
data[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + (((((int)threadIdx.x) + 1056) / 89) * 83)) + ((((int)threadIdx.x) + 77) % 89)) - 252))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 49) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 462) / 11) * 49) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) + (k_inner_outer / 7)) < 89) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 44) * 89) + ((k_inner_outer / 7) * 89)) + (((int)threadIdx.x) % 44)) + (k_inner_outer % 7)))];\n if (((((int)threadIdx.x) % 44) + (k_inner_outer % 7)) < 45) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 44) * 89) + ((k_inner_outer / 7) * 89)) + (((int)threadIdx.x) % 44)) + (k_inner_outer % 7)) + 44))];\n }\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 44) < 39) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n compute[((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 44) < 39) {\n compute[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)) + 44))] = DepthwiseConv2d_local[(1)];\n }\n }\n}\n", "gridDim": [59136, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[3,3,672,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[3,3,672,1]_[128,672,11,11].json new file mode 100644 index 000000000..ce3bcce1d --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[3,3,672,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 11, 11], "filter_shape": [672, 1, 3, 3], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_672_11_11___3_3_672_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_672_11_11___3_3_672_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[338];\n __shared__ float compute_shared[18];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n if (((int)threadIdx.x) < 338) {\n PaddedInput_shared[(((int)threadIdx.x))] = (((((13 <= (((int)threadIdx.x) % 169)) && ((((int)threadIdx.x) % 169) < 156)) && (1 <= (((int)threadIdx.x) % 13))) && ((((int)threadIdx.x) % 13) < 12)) ? 
data[((((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) / 169) * 121)) + (((((int)threadIdx.x) % 169) / 13) * 11)) + (((int)threadIdx.x) % 13)) - 12))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 18) {\n compute_shared[(((int)threadIdx.x))] = kernel[((((((int)blockIdx.x) % 336) * 18) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if (((((int)threadIdx.x) & 15) + (k_inner_outer % 3)) < 13) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 176) * 169) + (((((int)threadIdx.x) % 176) >> 4) * 13)) + ((k_inner_outer / 3) * 13)) + (((int)threadIdx.x) & 15)) + (k_inner_outer % 3)))];\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 176) * 9) + k_inner_outer))];\n if ((((int)threadIdx.x) & 15) < 11) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) & 15) < 11) {\n compute[((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [43008, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[5,5,672,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[5,5,672,1]_[128,672,11,11].json new file mode 100644 index 000000000..8f0dcbc23 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[5,5,672,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 11, 11], "filter_shape": [672, 1, 5, 5], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": 
"roller_DepthwiseConv2dNative__128_672_11_11___5_5_672_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_672_11_11___5_5_672_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[450];\n __shared__ float compute_shared[25];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = (((((30 <= (((int)threadIdx.x) % 225)) && ((((int)threadIdx.x) % 225) < 195)) && (2 <= (((int)threadIdx.x) % 15))) && ((((int)threadIdx.x) % 15) < 13)) ? data[((((((((((int)blockIdx.x) / 672) * 162624) + ((((int)threadIdx.x) / 225) * 81312)) + ((((int)blockIdx.x) % 672) * 121)) + (((((int)threadIdx.x) % 225) / 15) * 11)) + (((int)threadIdx.x) % 15)) - 24))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 98) {\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = ((((((int)threadIdx.x) < 68) && (2 <= ((((int)threadIdx.x) + 7) % 15))) && (((((int)threadIdx.x) + 7) % 15) < 13)) ? 
data[((((((((((int)blockIdx.x) / 672) * 162624) + (((((int)threadIdx.x) + 352) / 225) * 81312)) + ((((int)blockIdx.x) % 672) * 121)) + (((((int)threadIdx.x) + 127) / 15) * 11)) + ((((int)threadIdx.x) + 7) % 15)) - 24))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 25) {\n compute_shared[(((int)threadIdx.x))] = kernel[((((((int)blockIdx.x) % 672) * 25) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if (((((int)threadIdx.x) & 15) + (k_inner_outer % 5)) < 15) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 176) * 225) + (((((int)threadIdx.x) % 176) >> 4) * 15)) + ((k_inner_outer / 5) * 15)) + (((int)threadIdx.x) & 15)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) & 15) < 11) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) & 15) < 11) {\n compute[(((((((((int)blockIdx.x) / 672) * 162624) + ((((int)threadIdx.x) / 176) * 81312)) + ((((int)blockIdx.x) % 672) * 121)) + (((((int)threadIdx.x) % 176) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [43008, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[7,7,672,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[7,7,672,1]_[128,672,11,11].json new file mode 100644 index 000000000..7b83354ed --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,11,11]_[7,7,672,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 11, 11], "filter_shape": [672, 1, 7, 7], "output_shape": [128, 672, 11, 11], "window_movement_strides": [1, 1], "padding_below_diff": [3, 3], 
"window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_672_11_11___7_7_672_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_672_11_11___7_7_672_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[578];\n __shared__ float compute_shared[49];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = (((((51 <= (((int)threadIdx.x) % 289)) && ((((int)threadIdx.x) % 289) < 238)) && (3 <= (((int)threadIdx.x) % 17))) && ((((int)threadIdx.x) % 17) < 14)) ? data[((((((((((int)blockIdx.x) / 672) * 162624) + ((((int)threadIdx.x) / 289) * 81312)) + ((((int)blockIdx.x) % 672) * 121)) + (((((int)threadIdx.x) % 289) / 17) * 11)) + (((int)threadIdx.x) % 17)) - 36))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 226) {\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = ((((((int)threadIdx.x) < 175) && (3 <= ((((int)threadIdx.x) + 12) % 17))) && (((((int)threadIdx.x) + 12) % 17) < 14)) ? 
data[((((((((((int)blockIdx.x) / 672) * 162624) + (((((int)threadIdx.x) + 352) / 289) * 81312)) + ((((int)blockIdx.x) % 672) * 121)) + (((((int)threadIdx.x) + 63) / 17) * 11)) + ((((int)threadIdx.x) + 12) % 17)) - 36))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 49) {\n compute_shared[(((int)threadIdx.x))] = kernel[((((((int)blockIdx.x) % 672) * 49) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if (((((int)threadIdx.x) & 15) + (k_inner_outer % 7)) < 17) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 176) * 289) + (((((int)threadIdx.x) % 176) >> 4) * 17)) + ((k_inner_outer / 7) * 17)) + (((int)threadIdx.x) & 15)) + (k_inner_outer % 7)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) & 15) < 11) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) & 15) < 11) {\n compute[(((((((((int)blockIdx.x) / 672) * 162624) + ((((int)threadIdx.x) / 176) * 81312)) + ((((int)blockIdx.x) % 672) * 121)) + (((((int)threadIdx.x) % 176) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [43008, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[5,5,672,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[5,5,672,1]_[128,672,11,11].json new file mode 100644 index 000000000..90ad01faa --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[5,5,672,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 21, 21], "filter_shape": [672, 1, 5, 5], "output_shape": [128, 672, 11, 11], "window_movement_strides": [2, 2], "padding_below_diff": [2, 2], 
"window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_672_21_21___5_5_672_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_672_21_21___5_5_672_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[1250];\n __shared__ float compute_shared[50];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((50 <= ((int)threadIdx.x)) && (2 <= (((int)threadIdx.x) % 25))) && ((((int)threadIdx.x) % 25) < 23)) ? data[(((((((int)blockIdx.x) * 882) + ((((int)threadIdx.x) / 25) * 21)) + (((int)threadIdx.x) % 25)) - 44))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = (((((50 <= ((((int)threadIdx.x) + 352) % 625)) && (((((int)threadIdx.x) + 352) % 625) < 575)) && (2 <= ((((int)threadIdx.x) + 2) % 25))) && (((((int)threadIdx.x) + 2) % 25) < 23)) ? data[((((((((int)blockIdx.x) * 882) + (((((int)threadIdx.x) + 352) / 625) * 441)) + ((((((int)threadIdx.x) + 352) % 625) / 25) * 21)) + ((((int)threadIdx.x) + 2) % 25)) - 44))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((2 <= ((((int)threadIdx.x) + 4) % 25)) && (((((int)threadIdx.x) + 4) % 25) < 23)) ? data[((((((((int)blockIdx.x) * 882) + (((((int)threadIdx.x) + 704) / 625) * 441)) + (((((int)threadIdx.x) + 79) / 25) * 21)) + ((((int)threadIdx.x) + 4) % 25)) - 44))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 194) {\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = ((((((int)threadIdx.x) < 144) && (2 <= ((((int)threadIdx.x) + 6) % 25))) && (((((int)threadIdx.x) + 6) % 25) < 23)) ? 
data[((((((((int)blockIdx.x) * 882) + (((((int)threadIdx.x) + 1056) / 625) * 441)) + (((((int)threadIdx.x) + 431) / 25) * 21)) + ((((int)threadIdx.x) + 6) % 25)) - 44))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 50) {\n compute_shared[(((int)threadIdx.x))] = kernel[((((((int)blockIdx.x) % 336) * 50) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if ((((((int)threadIdx.x) & 15) * 2) + (k_inner_outer % 5)) < 25) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 176) * 625) + (((((int)threadIdx.x) % 176) >> 4) * 50)) + ((k_inner_outer / 5) * 25)) + ((((int)threadIdx.x) & 15) * 2)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 176) * 25) + k_inner_outer))];\n if ((((int)threadIdx.x) & 15) < 11) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) & 15) < 11) {\n compute[((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [43008, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[7,7,672,1]_[128,672,11,11].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[7,7,672,1]_[128,672,11,11].json new file mode 100644 index 000000000..041532bc4 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,672,21,21]_[7,7,672,1]_[128,672,11,11].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 672, 21, 21], "filter_shape": [672, 1, 7, 7], "output_shape": [128, 672, 11, 11], "window_movement_strides": [2, 2], "padding_below_diff": [3, 3], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": 
"roller_DepthwiseConv2dNative__128_672_21_21___7_7_672_1___128_672_11_11_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_672_21_21___7_7_672_1___128_672_11_11_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[1458];\n __shared__ float compute_shared[98];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((81 <= ((int)threadIdx.x)) && (3 <= (((int)threadIdx.x) % 27))) && ((((int)threadIdx.x) % 27) < 24)) ? data[(((((((int)blockIdx.x) * 882) + ((((int)threadIdx.x) / 27) * 21)) + (((int)threadIdx.x) % 27)) - 66))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = ((((((int)threadIdx.x) < 296) && (3 <= ((((int)threadIdx.x) + 1) % 27))) && (((((int)threadIdx.x) + 1) % 27) < 24)) ? data[(((((((int)blockIdx.x) * 882) + (((((int)threadIdx.x) + 352) / 27) * 21)) + ((((int)threadIdx.x) + 1) % 27)) - 66))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((((81 <= ((((int)threadIdx.x) + 704) % 729)) && (((((int)threadIdx.x) + 704) % 729) < 648)) && (3 <= ((((int)threadIdx.x) + 2) % 27))) && (((((int)threadIdx.x) + 2) % 27) < 24)) ? data[((((((((int)blockIdx.x) * 882) + (((((int)threadIdx.x) + 704) / 729) * 441)) + ((((((int)threadIdx.x) + 704) % 729) / 27) * 21)) + ((((int)threadIdx.x) + 2) % 27)) - 66))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = ((((((int)threadIdx.x) < 321) && (3 <= ((((int)threadIdx.x) + 3) % 27))) && (((((int)threadIdx.x) + 3) % 27) < 24)) ? 
data[((((((((int)blockIdx.x) * 882) + (((((int)threadIdx.x) + 1056) / 729) * 441)) + (((((int)threadIdx.x) + 327) / 27) * 21)) + ((((int)threadIdx.x) + 3) % 27)) - 66))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 50) {\n PaddedInput_shared[((((int)threadIdx.x) + 1408))] = 0.000000e+00f;\n }\n if (((int)threadIdx.x) < 98) {\n compute_shared[(((int)threadIdx.x))] = kernel[((((((int)blockIdx.x) % 336) * 98) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if ((((((int)threadIdx.x) & 15) * 2) + (k_inner_outer % 7)) < 27) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 176) * 729) + (((((int)threadIdx.x) % 176) >> 4) * 54)) + ((k_inner_outer / 7) * 27)) + ((((int)threadIdx.x) & 15) * 2)) + (k_inner_outer % 7)))];\n }\n compute_shared_local[(0)] = compute_shared[((((((int)threadIdx.x) / 176) * 49) + k_inner_outer))];\n if ((((int)threadIdx.x) & 15) < 11) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) & 15) < 11) {\n compute[((((((int)blockIdx.x) * 242) + ((((int)threadIdx.x) >> 4) * 11)) + (((int)threadIdx.x) & 15)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [43008, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[3,3,84,1]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[3,3,84,1]_[128,84,42,42].json new file mode 100644 index 000000000..48af431d0 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[3,3,84,1]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 42, 42], "filter_shape": [84, 1, 3, 3], "output_shape": [128, 84, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], 
"window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_84_42_42___3_3_84_1___128_84_42_42_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_84_42_42___3_3_84_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[264];\n __shared__ float compute_shared[9];\n float PaddedInput_shared_local[2];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((1 <= (((((int)blockIdx.x) % 11) * 4) + (((int)threadIdx.x) / 44))) && (1 <= (((int)threadIdx.x) % 44))) && ((((int)threadIdx.x) % 44) < 43)) ? data[(((((((((int)blockIdx.x) / 11) * 1764) + ((((int)blockIdx.x) % 11) * 168)) + ((((int)threadIdx.x) / 44) * 42)) + (((int)threadIdx.x) % 44)) - 43))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) % 11) * 4) + ((((int)threadIdx.x) + 96) / 44)) < 44) {\n PaddedInput_shared[((((int)threadIdx.x) + 96))] = (((((((((int)blockIdx.x) % 11) * 4) + ((((int)threadIdx.x) + 96) / 44)) < 43) && (1 <= ((((int)threadIdx.x) + 8) % 44))) && (((((int)threadIdx.x) + 8) % 44) < 43)) ? data[(((((((((int)blockIdx.x) / 11) * 1764) + ((((int)blockIdx.x) % 11) * 168)) + (((((int)threadIdx.x) + 96) / 44) * 42)) + ((((int)threadIdx.x) + 8) % 44)) - 43))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 72) {\n if ((((((int)blockIdx.x) % 11) * 4) + ((((int)threadIdx.x) + 192) / 44)) < 44) {\n PaddedInput_shared[((((int)threadIdx.x) + 192))] = (((((((((int)blockIdx.x) % 11) * 4) + ((((int)threadIdx.x) + 192) / 44)) < 43) && (1 <= ((((int)threadIdx.x) + 16) % 44))) && (((((int)threadIdx.x) + 16) % 44) < 43)) ? 
data[(((((((((int)blockIdx.x) / 11) * 1764) + ((((int)blockIdx.x) % 11) * 168)) + (((((int)threadIdx.x) + 192) / 44) * 42)) + ((((int)threadIdx.x) + 16) % 44)) - 43))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 9) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 924) / 11) * 9) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 11) * 4) + (((int)threadIdx.x) / 24)) + (k_inner_outer / 3)) < 44) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 24) * 44) + ((k_inner_outer / 3) * 44)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 3)))];\n if (((((int)threadIdx.x) % 24) + (k_inner_outer % 3)) < 20) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 24) * 44) + ((k_inner_outer / 3) * 44)) + (((int)threadIdx.x) % 24)) + (k_inner_outer % 3)) + 24))];\n }\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) % 11) * 4) + (((int)threadIdx.x) / 24)) < 42) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 24) < 18) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 11) * 4) + (((int)threadIdx.x) / 24)) < 42) {\n compute[((((((((int)blockIdx.x) / 11) * 1764) + ((((int)blockIdx.x) % 11) * 168)) + ((((int)threadIdx.x) / 24) * 42)) + (((int)threadIdx.x) % 24)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 24) < 18) {\n compute[(((((((((int)blockIdx.x) / 11) * 1764) + ((((int)blockIdx.x) % 11) * 168)) + ((((int)threadIdx.x) / 24) * 42)) + (((int)threadIdx.x) % 24)) + 24))] = DepthwiseConv2d_local[(1)];\n }\n }\n}\n", "gridDim": [118272, 1, 1], "blockDim": [96, 1, 1]} \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[5,5,84,1]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[5,5,84,1]_[128,84,42,42].json new file mode 100644 index 000000000..07ce33f52 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[5,5,84,1]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 42, 42], "filter_shape": [84, 1, 5, 5], "output_shape": [128, 84, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_84_42_42___5_5_84_1___128_84_42_42_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_84_42_42___5_5_84_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[460];\n __shared__ float compute_shared[25];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((2 <= (((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 46))) && (2 <= (((int)threadIdx.x) % 46))) && ((((int)threadIdx.x) % 46) < 44)) ? data[(((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 46) * 42)) + (((int)threadIdx.x) % 46)) - 86))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 172) {\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 7) * 6) + ((((int)threadIdx.x) + 288) / 46)) < 44) && (2 <= ((((int)threadIdx.x) + 12) % 46))) && (((((int)threadIdx.x) + 12) % 46) < 44)) ? 
data[(((((((int)blockIdx.x) * 252) + (((((int)threadIdx.x) + 288) / 46) * 42)) + ((((int)threadIdx.x) + 12) % 46)) - 86))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 25) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 7) * 25) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if (((((int)threadIdx.x) % 48) + (k_inner_outer % 5)) < 46) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 48) * 46) + ((k_inner_outer / 5) * 46)) + (((int)threadIdx.x) % 48)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) % 48) < 42) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 48) < 42) {\n compute[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[7,7,84,1]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[7,7,84,1]_[128,84,42,42].json new file mode 100644 index 000000000..edf3ae6b0 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,42,42]_[7,7,84,1]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 42, 42], "filter_shape": [84, 1, 7, 7], "output_shape": [128, 84, 42, 42], "window_movement_strides": [1, 1], "padding_below_diff": [3, 3], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_84_42_42___7_7_84_1___128_84_42_42_", "code": "extern \"C\" __global__ void 
roller_DepthwiseConv2dNative__128_84_42_42___7_7_84_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[576];\n __shared__ float compute_shared[49];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((3 <= (((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 48))) && (3 <= (((int)threadIdx.x) % 48))) && ((((int)threadIdx.x) % 48) < 45)) ? data[(((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)) - 129))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((((((((int)blockIdx.x) % 7) * 6) + (((int)threadIdx.x) / 48)) < 39) && (3 <= (((int)threadIdx.x) % 48))) && ((((int)threadIdx.x) % 48) < 45)) ? data[(((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)) + 123))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 49) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 7) * 49) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if (((((int)threadIdx.x) % 48) + (k_inner_outer % 7)) < 48) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[(((((k_inner_outer / 7) * 48) + ((int)threadIdx.x)) + (k_inner_outer % 7)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) % 48) < 42) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 48) < 42) {\n compute[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[5,5,84,1]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[5,5,84,1]_[128,84,42,42].json new file mode 100644 index 000000000..0d8f2f01b --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[5,5,84,1]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 83, 83], "filter_shape": [84, 1, 5, 5], "output_shape": [128, 84, 42, 42], "window_movement_strides": [2, 2], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_84_83_83___5_5_84_1___128_84_42_42_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_84_83_83___5_5_84_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[1305];\n __shared__ float compute_shared[25];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((2 <= (((((int)blockIdx.x) % 7) * 12) + (((int)threadIdx.x) / 87))) && (2 <= (((int)threadIdx.x) % 87))) && ((((int)threadIdx.x) % 87) < 85)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + ((((int)threadIdx.x) / 87) * 83)) + (((int)threadIdx.x) % 87)) - 168))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((2 <= ((((int)threadIdx.x) + 27) % 87)) && (((((int)threadIdx.x) + 27) % 87) < 85)) ? 
data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 288) / 87) * 83)) + ((((int)threadIdx.x) + 27) % 87)) - 168))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = (((2 <= ((((int)threadIdx.x) + 54) % 87)) && (((((int)threadIdx.x) + 54) % 87) < 85)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 576) / 87) * 83)) + ((((int)threadIdx.x) + 54) % 87)) - 168))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 864))] = (((((((((int)blockIdx.x) % 7) * 12) + ((((int)threadIdx.x) + 864) / 87)) < 85) && (2 <= ((((int)threadIdx.x) + 81) % 87))) && (((((int)threadIdx.x) + 81) % 87) < 85)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 864) / 87) * 83)) + ((((int)threadIdx.x) + 81) % 87)) - 168))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 153) {\n PaddedInput_shared[((((int)threadIdx.x) + 1152))] = (((((((((int)blockIdx.x) % 7) * 12) + ((((int)threadIdx.x) + 1152) / 87)) < 85) && (2 <= ((((int)threadIdx.x) + 21) % 87))) && (((((int)threadIdx.x) + 21) % 87) < 85)) ? 
data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 1152) / 87) * 83)) + ((((int)threadIdx.x) + 21) % 87)) - 168))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 25) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 7) * 25) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if ((((((int)threadIdx.x) % 48) * 2) + (k_inner_outer % 5)) < 87) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 48) * 174) + ((k_inner_outer / 5) * 87)) + ((((int)threadIdx.x) % 48) * 2)) + (k_inner_outer % 5)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) % 48) < 42) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 48) < 42) {\n compute[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[7,7,84,1]_[128,84,42,42].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[7,7,84,1]_[128,84,42,42].json new file mode 100644 index 000000000..63bd0ab32 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,84,83,83]_[7,7,84,1]_[128,84,42,42].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 84, 83, 83], "filter_shape": [84, 1, 7, 7], "output_shape": [128, 84, 42, 42], "window_movement_strides": [2, 2], "padding_below_diff": [3, 3], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_84_83_83___7_7_84_1___128_84_42_42_", "code": "extern \"C\" 
__global__ void roller_DepthwiseConv2dNative__128_84_83_83___7_7_84_1___128_84_42_42_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[1];\n __shared__ float PaddedInput_shared[1513];\n __shared__ float compute_shared[49];\n float PaddedInput_shared_local[1];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((3 <= (((((int)blockIdx.x) % 7) * 12) + (((int)threadIdx.x) / 89))) && (3 <= (((int)threadIdx.x) % 89))) && ((((int)threadIdx.x) % 89) < 86)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + ((((int)threadIdx.x) / 89) * 83)) + (((int)threadIdx.x) % 89)) - 252))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 288))] = (((3 <= ((((int)threadIdx.x) + 21) % 89)) && (((((int)threadIdx.x) + 21) % 89) < 86)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 288) / 89) * 83)) + ((((int)threadIdx.x) + 21) % 89)) - 252))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 576))] = (((3 <= ((((int)threadIdx.x) + 42) % 89)) && (((((int)threadIdx.x) + 42) % 89) < 86)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 576) / 89) * 83)) + ((((int)threadIdx.x) + 42) % 89)) - 252))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 864))] = (((3 <= ((((int)threadIdx.x) + 63) % 89)) && (((((int)threadIdx.x) + 63) % 89) < 86)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 864) / 89) * 83)) + ((((int)threadIdx.x) + 63) % 89)) - 252))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 1152))] = (((((((((int)blockIdx.x) % 7) * 12) + ((((int)threadIdx.x) + 1152) / 89)) < 86) && (3 <= ((((int)threadIdx.x) + 84) % 89))) && (((((int)threadIdx.x) + 84) % 89) < 86)) ? 
data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 1152) / 89) * 83)) + ((((int)threadIdx.x) + 84) % 89)) - 252))] : 0.000000e+00f);\n if (((int)threadIdx.x) < 73) {\n PaddedInput_shared[((((int)threadIdx.x) + 1440))] = ((((((((int)blockIdx.x) % 7) * 12) + ((((int)threadIdx.x) + 1440) / 89)) < 86) && (((int)threadIdx.x) < 70)) ? data[(((((((((int)blockIdx.x) / 7) * 6889) + ((((int)blockIdx.x) % 7) * 996)) + (((((int)threadIdx.x) + 1440) / 89) * 83)) + (((int)threadIdx.x) + 16)) - 252))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 49) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 588) / 7) * 49) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if ((((((int)threadIdx.x) % 48) * 2) + (k_inner_outer % 7)) < 89) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 48) * 178) + ((k_inner_outer / 7) * 89)) + ((((int)threadIdx.x) % 48) * 2)) + (k_inner_outer % 7)))];\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((int)threadIdx.x) % 48) < 42) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n }\n }\n if ((((int)threadIdx.x) % 48) < 42) {\n compute[((((((int)blockIdx.x) * 252) + ((((int)threadIdx.x) / 48) * 42)) + (((int)threadIdx.x) % 48)))] = DepthwiseConv2d_local[(0)];\n }\n}\n", "gridDim": [75264, 1, 1], "blockDim": [288, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[5,5,96,1]_[128,96,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[5,5,96,1]_[128,96,83,83].json new file mode 100644 index 000000000..abbe1b330 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[5,5,96,1]_[128,96,83,83].json @@ 
-0,0 +1 @@ +{"parameters": {"input_shape": [128, 96, 165, 165], "filter_shape": [96, 1, 5, 5], "output_shape": [128, 96, 83, 83], "window_movement_strides": [2, 2], "padding_below_diff": [2, 2], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_96_165_165___5_5_96_1___128_96_83_83_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_96_165_165___5_5_96_1___128_96_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[3211];\n __shared__ float compute_shared[25];\n float PaddedInput_shared_local[2];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((2 <= (((((int)blockIdx.x) % 11) * 16) + (((int)threadIdx.x) / 169))) && (2 <= (((int)threadIdx.x) % 169))) && ((((int)threadIdx.x) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + ((((int)threadIdx.x) / 169) * 165)) + (((int)threadIdx.x) % 169)) - 332))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = (((2 <= ((((int)threadIdx.x) + 14) % 169)) && (((((int)threadIdx.x) + 14) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 352) / 169) * 165)) + ((((int)threadIdx.x) + 14) % 169)) - 332))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((2 <= ((((int)threadIdx.x) + 28) % 169)) && (((((int)threadIdx.x) + 28) % 169) < 167)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 704) / 169) * 165)) + ((((int)threadIdx.x) + 28) % 169)) - 332))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1056) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 42) % 169))) && (((((int)threadIdx.x) + 42) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1056) / 169) * 165)) + ((((int)threadIdx.x) + 42) % 169)) - 332))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1408) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 1408))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1408) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 56) % 169))) && (((((int)threadIdx.x) + 56) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1408) / 169) * 165)) + ((((int)threadIdx.x) + 56) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1760) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 1760))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1760) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 70) % 169))) && (((((int)threadIdx.x) + 70) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1760) / 169) * 165)) + ((((int)threadIdx.x) + 70) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2112) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 2112))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2112) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 84) % 169))) && (((((int)threadIdx.x) + 84) % 169) < 167)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2112) / 169) * 165)) + ((((int)threadIdx.x) + 84) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2464) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 2464))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2464) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 98) % 169))) && (((((int)threadIdx.x) + 98) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2464) / 169) * 165)) + ((((int)threadIdx.x) + 98) % 169)) - 332))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2816) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 2816))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2816) / 169)) < 167) && (2 <= ((((int)threadIdx.x) + 112) % 169))) && (((((int)threadIdx.x) + 112) % 169) < 167)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2816) / 169) * 165)) + ((((int)threadIdx.x) + 112) % 169)) - 332))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 43) {\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3168) / 169)) < 169) {\n PaddedInput_shared[((((int)threadIdx.x) + 3168))] = ((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3168) / 169)) < 167) && (((int)threadIdx.x) < 41)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 3168) / 169) * 165)) + (((int)threadIdx.x) + 126)) - 332))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 25) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 1056) / 11) * 25) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 25; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) / 44) * 2)) + (k_inner_outer / 5)) < 169) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 44) * 338) + ((k_inner_outer / 5) * 169)) + ((((int)threadIdx.x) % 44) * 2)) + (k_inner_outer % 5)))];\n if ((((((int)threadIdx.x) % 44) * 2) + (k_inner_outer % 5)) < 81) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 44) * 338) + ((k_inner_outer / 5) * 169)) + ((((int)threadIdx.x) % 44) * 2)) + (k_inner_outer % 5)) + 88))];\n }\n }\n compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 44) < 39) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n compute[((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 44) < 39) {\n compute[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)) + 44))] = DepthwiseConv2d_local[(1)];\n }\n }\n}\n", "gridDim": [135168, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end 
of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[7,7,96,1]_[128,96,83,83].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[7,7,96,1]_[128,96,83,83].json new file mode 100644 index 000000000..aea283913 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_DepthwiseConv2dNative_[128,96,165,165]_[7,7,96,1]_[128,96,83,83].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 96, 165, 165], "filter_shape": [96, 1, 7, 7], "output_shape": [128, 96, 83, 83], "window_movement_strides": [2, 2], "padding_below_diff": [3, 3], "window_dilation_strides": [1, 1]}, "op_type": "DepthwiseConv2dNative", "tvm_func_name": "roller_DepthwiseConv2dNative__128_96_165_165___7_7_96_1___128_96_83_83_", "code": "extern \"C\" __global__ void roller_DepthwiseConv2dNative__128_96_165_165___7_7_96_1___128_96_83_83_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute) {\n float DepthwiseConv2d_local[2];\n __shared__ float PaddedInput_shared[3591];\n __shared__ float compute_shared[49];\n float PaddedInput_shared_local[2];\n float compute_shared_local[1];\n DepthwiseConv2d_local[(0)] = 0.000000e+00f;\n DepthwiseConv2d_local[(1)] = 0.000000e+00f;\n PaddedInput_shared[(((int)threadIdx.x))] = ((((3 <= (((((int)blockIdx.x) % 11) * 16) + (((int)threadIdx.x) / 171))) && (3 <= (((int)threadIdx.x) % 171))) && ((((int)threadIdx.x) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + ((((int)threadIdx.x) / 171) * 165)) + (((int)threadIdx.x) % 171)) - 498))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 352))] = ((((3 <= (((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 352) / 171))) && (3 <= ((((int)threadIdx.x) + 10) % 171))) && (((((int)threadIdx.x) + 10) % 171) < 168)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 352) / 171) * 165)) + ((((int)threadIdx.x) + 10) % 171)) - 498))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 704))] = (((3 <= ((((int)threadIdx.x) + 20) % 171)) && (((((int)threadIdx.x) + 20) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 704) / 171) * 165)) + ((((int)threadIdx.x) + 20) % 171)) - 498))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 1056))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1056) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 30) % 171))) && (((((int)threadIdx.x) + 30) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1056) / 171) * 165)) + ((((int)threadIdx.x) + 30) % 171)) - 498))] : 0.000000e+00f);\n PaddedInput_shared[((((int)threadIdx.x) + 1408))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1408) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 40) % 171))) && (((((int)threadIdx.x) + 40) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1408) / 171) * 165)) + ((((int)threadIdx.x) + 40) % 171)) - 498))] : 0.000000e+00f);\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1760) / 171)) < 171) {\n PaddedInput_shared[((((int)threadIdx.x) + 1760))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 1760) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 50) % 171))) && (((((int)threadIdx.x) + 50) % 171) < 168)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 1760) / 171) * 165)) + ((((int)threadIdx.x) + 50) % 171)) - 498))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2112) / 171)) < 171) {\n PaddedInput_shared[((((int)threadIdx.x) + 2112))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2112) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 60) % 171))) && (((((int)threadIdx.x) + 60) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2112) / 171) * 165)) + ((((int)threadIdx.x) + 60) % 171)) - 498))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2464) / 171)) < 171) {\n PaddedInput_shared[((((int)threadIdx.x) + 2464))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2464) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 70) % 171))) && (((((int)threadIdx.x) + 70) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2464) / 171) * 165)) + ((((int)threadIdx.x) + 70) % 171)) - 498))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2816) / 171)) < 171) {\n PaddedInput_shared[((((int)threadIdx.x) + 2816))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 2816) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 80) % 171))) && (((((int)threadIdx.x) + 80) % 171) < 168)) ? 
data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 2816) / 171) * 165)) + ((((int)threadIdx.x) + 80) % 171)) - 498))] : 0.000000e+00f);\n }\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3168) / 171)) < 171) {\n PaddedInput_shared[((((int)threadIdx.x) + 3168))] = (((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3168) / 171)) < 168) && (3 <= ((((int)threadIdx.x) + 90) % 171))) && (((((int)threadIdx.x) + 90) % 171) < 168)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 3168) / 171) * 165)) + ((((int)threadIdx.x) + 90) % 171)) - 498))] : 0.000000e+00f);\n }\n if (((int)threadIdx.x) < 71) {\n if ((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3520) / 171)) < 171) {\n PaddedInput_shared[((((int)threadIdx.x) + 3520))] = ((((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) + 3520) / 171)) < 168) && (((int)threadIdx.x) < 68)) ? data[(((((((((int)blockIdx.x) / 11) * 27225) + ((((int)blockIdx.x) % 11) * 2640)) + (((((int)threadIdx.x) + 3520) / 171) * 165)) + (((int)threadIdx.x) + 100)) - 498))] : 0.000000e+00f);\n }\n }\n if (((int)threadIdx.x) < 49) {\n compute_shared[(((int)threadIdx.x))] = kernel[(((((((int)blockIdx.x) % 1056) / 11) * 49) + ((int)threadIdx.x)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n if (((((((int)blockIdx.x) % 11) * 16) + ((((int)threadIdx.x) / 44) * 2)) + (k_inner_outer / 7)) < 171) {\n PaddedInput_shared_local[(0)] = PaddedInput_shared[((((((((int)threadIdx.x) / 44) * 342) + ((k_inner_outer / 7) * 171)) + ((((int)threadIdx.x) % 44) * 2)) + (k_inner_outer % 7)))];\n if ((((((int)threadIdx.x) % 44) * 2) + (k_inner_outer % 7)) < 83) {\n PaddedInput_shared_local[(1)] = PaddedInput_shared[(((((((((int)threadIdx.x) / 44) * 342) + ((k_inner_outer / 7) * 171)) + ((((int)threadIdx.x) % 44) * 2)) + (k_inner_outer % 7)) + 88))];\n }\n }\n 
compute_shared_local[(0)] = compute_shared[(k_inner_outer)];\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n DepthwiseConv2d_local[(0)] = (DepthwiseConv2d_local[(0)] + (PaddedInput_shared_local[(0)] * compute_shared_local[(0)]));\n if ((((int)threadIdx.x) % 44) < 39) {\n DepthwiseConv2d_local[(1)] = (DepthwiseConv2d_local[(1)] + (PaddedInput_shared_local[(1)] * compute_shared_local[(0)]));\n }\n }\n }\n if ((((((int)blockIdx.x) % 11) * 8) + (((int)threadIdx.x) / 44)) < 83) {\n compute[((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)))] = DepthwiseConv2d_local[(0)];\n if ((((int)threadIdx.x) % 44) < 39) {\n compute[(((((((((int)blockIdx.x) / 11) * 6889) + ((((int)blockIdx.x) % 11) * 664)) + ((((int)threadIdx.x) / 44) * 83)) + (((int)threadIdx.x) % 44)) + 44))] = DepthwiseConv2d_local[(1)];\n }\n }\n}\n", "gridDim": [135168, 1, 1], "blockDim": [352, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Dot_[128,4032]_[4032,1000]_[128,1000].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Dot_[128,4032]_[4032,1000]_[128,1000].json new file mode 100644 index 000000000..6f7b51d3e --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Dot_[128,4032]_[4032,1000]_[128,1000].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [128, 4032], "arg1_shape": [4032, 1000], "out_shape": [128, 1000], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__128_4032___4032_1000___128_1000_", "code": "extern \"C\" __global__ void roller_Dot__128_4032___4032_1000___128_1000_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[4];\n __shared__ float A_shared[528];\n __shared__ float B_shared[1024];\n float A_shared_local[2];\n float B_shared_local[2];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n 
compute_local[(1)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 126; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[((((((((int)blockIdx.x) >> 5) * 64512) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 132))] = A[(((((((((int)blockIdx.x) >> 5) * 64512) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16128))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[(((((((((int)blockIdx.x) >> 5) * 64512) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 396))] = A[(((((((((int)blockIdx.x) >> 5) * 64512) + ((((int)threadIdx.x) >> 5) * 4032)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 48384))];\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[(((int)threadIdx.x))] = B[(((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 128))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 4000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 8000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 384))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + 
((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 12000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 16000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 640))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 20000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 24000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 896))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 28000))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 264))];\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 1000) {\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 32) + (((int)threadIdx.x) & 15)))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 984) {\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 32) + (((int)threadIdx.x) & 15)) + 16))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 1000) {\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(1)] * 
B_shared_local[(0)]));\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 984) {\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n }\n }\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 1000) {\n compute[((((((((int)blockIdx.x) >> 5) * 16000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 5) * 16000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)) + 8000))] = compute_local[(2)];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 984) {\n compute[(((((((((int)blockIdx.x) >> 5) * 16000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n compute[(((((((((int)blockIdx.x) >> 5) * 16000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)) + 8016))] = compute_local[(3)];\n }\n}\n", "gridDim": [256, 1, 1], "blockDim": [128, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_nas/roller_Sum_[128,4032,11,11]_[128,4032].json b/src/tools/nnfusion/kernel_db/roller_nas/roller_Sum_[128,4032,11,11]_[128,4032].json new file mode 100644 index 000000000..46c20ee7a --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_nas/roller_Sum_[128,4032,11,11]_[128,4032].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 4032, 11, 11], "output_shape": [128, 4032], "reduction_axis": [2, 3]}, "op_type": "Sum", "tvm_func_name": "roller_Sum__128_4032_11_11___128_4032_", "code": "extern \"C\" __global__ void roller_Sum__128_4032_11_11___128_4032_(float* __restrict__ A, float* __restrict__ compute) {\n float compute_local[1];\n __shared__ float A_shared[6144];\n 
float A_shared_local[1];\n compute_local[(0)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 2; ++k_outer) {\n __syncthreads();\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[(((int)threadIdx.x))] = A[(((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 96))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 96) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 192))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 363))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 288))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 288) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 384))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 726))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 480))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 480) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 576))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 1089))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 672))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 672) >> 6) * 121)) + 
(k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 768))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 1452))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 864))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 864) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 960))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 1815))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1056))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 1056) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1152))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 2178))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1248))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 1248) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1344))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 2541))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1440))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 1440) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 
32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1536))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 2904))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1632))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 1632) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1728))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 3267))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1824))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 1824) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 1920))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 3630))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2016))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 2016) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2112))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 3993))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2208))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 2208) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) 
+ (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2304))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 4356))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2400))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 2400) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2496))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 4719))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2592))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 2592) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2688))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 5082))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2784))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 2784) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2880))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 5445))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 2976))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 2976) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n 
A_shared[((((int)threadIdx.x) + 3072))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 5808))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3168))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 3168) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3264))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 6171))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3360))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 3360) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3456))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 6534))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3552))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 3552) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3648))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 6897))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3744))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 3744) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3840))] 
= A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 7260))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 3936))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 3936) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4032))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 7623))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4128))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 4128) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4224))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 7986))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4320))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 4320) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4416))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 8349))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4512))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 4512) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4608))] = A[((((((((int)blockIdx.x) * 11616) + 
((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 8712))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4704))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 4704) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4800))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 9075))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4896))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 4896) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 4992))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 9438))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5088))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 5088) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5184))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 9801))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5280))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 5280) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5376))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + 
(k_outer * 64)) + (((int)threadIdx.x) & 63)) + 10164))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5472))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 5472) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5568))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 10527))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5664))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 5664) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5760))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 10890))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5856))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 5856) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n if (((k_outer * 64) + (((int)threadIdx.x) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 5952))] = A[((((((((int)blockIdx.x) * 11616) + ((((int)threadIdx.x) >> 6) * 121)) + (k_outer * 64)) + (((int)threadIdx.x) & 63)) + 11253))];\n }\n if (((k_outer * 64) + ((((int)threadIdx.x) + 32) & 63)) < 121) {\n A_shared[((((int)threadIdx.x) + 6048))] = A[(((((((int)blockIdx.x) * 11616) + (((((int)threadIdx.x) + 6048) >> 6) * 121)) + (k_outer * 64)) + ((((int)threadIdx.x) + 32) & 63)))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 64; ++k_inner_outer) {\n if (((k_outer * 64) + k_inner_outer) < 121) {\n A_shared_local[(0)] = A_shared[(((((int)threadIdx.x) * 64) + 
k_inner_outer))];\n }\n if (((k_outer * 64) + k_inner_outer) < 121) {\n compute_local[(0)] = (compute_local[(0)] + A_shared_local[(0)]);\n }\n }\n }\n compute[(((((int)blockIdx.x) * 96) + ((int)threadIdx.x)))] = compute_local[(0)];\n}\n", "gridDim": [5376, 1, 1], "blockDim": [96, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/convert.sh b/src/tools/nnfusion/kernel_db/roller_res/convert.sh new file mode 100755 index 000000000..06df481f2 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/convert.sh @@ -0,0 +1,26 @@ +python ../convert_external.py roller_Convolution_\[128\,3\,230\,230\]_\[64\,3\,7\,7\]_\[128\,64\,112\,112\].json +python ../convert_external.py roller_Convolution_\[128\,64\,56\,56\]_\[256\,64\,1\,1\]_\[128\,256\,56\,56\].json +python ../convert_external.py roller_Convolution_\[128\,64\,56\,56\]_\[64\,64\,1\,1\]_\[128\,64\,56\,56\].json +python ../convert_external.py roller_Convolution_\[128\,64\,56\,56\]_\[64\,64\,3\,3\]_\[128\,64\,56\,56\].json +python ../convert_external.py roller_Convolution_\[128\,256\,56\,56\]_\[64\,256\,1\,1\]_\[128\,64\,56\,56\].json +python ../convert_external.py roller_Convolution_\[128\,256\,56\,56\]_\[512\,256\,1\,1\]_\[128\,512\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,256\,56\,56\]_\[128\,256\,1\,1\]_\[128\,128\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,128\,58\,58\]_\[128\,128\,3\,3\]_\[128\,128\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,128\,28\,28\]_\[512\,128\,1\,1\]_\[128\,512\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,512\,28\,28\]_\[128\,512\,1\,1\]_\[128\,128\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,128\,28\,28\]_\[128\,128\,3\,3\]_\[128\,128\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,512\,28\,28\]_\[1024\,512\,1\,1\]_\[128\,1024\,14\,14\].json +python ../convert_external.py 
roller_Convolution_\[128\,512\,28\,28\]_\[256\,512\,1\,1\]_\[128\,256\,28\,28\].json +python ../convert_external.py roller_Convolution_\[128\,256\,30\,30\]_\[256\,256\,3\,3\]_\[128\,256\,14\,14\].json +python ../convert_external.py roller_Convolution_\[128\,256\,14\,14\]_\[1024\,256\,1\,1\]_\[128\,1024\,14\,14\].json +python ../convert_external.py roller_Convolution_\[128\,1024\,14\,14\]_\[256\,1024\,1\,1\]_\[128\,256\,14\,14\].json +python ../convert_external.py roller_Convolution_\[128\,256\,14\,14\]_\[256\,256\,3\,3\]_\[128\,256\,14\,14\].json +python ../convert_external.py roller_Convolution_\[128\,1024\,14\,14\]_\[2048\,1024\,1\,1\]_\[128\,2048\,7\,7\].json +python ../convert_external.py roller_Convolution_\[128\,1024\,14\,14\]_\[512\,1024\,1\,1\]_\[128\,512\,14\,14\].json +python ../convert_external.py roller_Convolution_\[128\,512\,16\,16\]_\[512\,512\,3\,3\]_\[128\,512\,7\,7\].json +python ../convert_external.py roller_Convolution_\[128\,512\,7\,7\]_\[2048\,512\,1\,1\]_\[128\,2048\,7\,7\].json +python ../convert_external.py roller_Convolution_\[128\,2048\,7\,7\]_\[512\,2048\,1\,1\]_\[128\,512\,7\,7\].json +python ../convert_external.py roller_Convolution_\[128\,512\,7\,7\]_\[512\,512\,3\,3\]_\[128\,512\,7\,7\].json +# python ../convert_external.py roller_Dot_\[128\,2048\]_\[2048\,1000\]_\[128\,1000\].json +# python ../convert_external.py roller_MaxPool_\[128\,64\,112\,112\]_\[128\,64\,56\,56\].json +# python ../convert_external.py roller_Sum_\[128\,2048\,7\,7\]_\[128\,2048\].json \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[2048,1024,1,1]_[128,2048,7,7].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[2048,1024,1,1]_[128,2048,7,7].json new file mode 100644 index 000000000..fad3ba614 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[2048,1024,1,1]_[128,2048,7,7].json @@ -0,0 +1 @@ +{"parameters": 
{"input_shape": [128, 1024, 14, 14], "filter_shape": [2048, 1024, 1, 1], "output_shape": [128, 2048, 7, 7], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_1024_14_14___2048_1024_1_1___128_2048_7_7_", "code": "extern \"C\" __global__ void roller_Convolution__128_1024_14_14___2048_1024_1_1___128_2048_7_7_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[8192];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 
0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 
2)) + 784))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + 2352))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + 3920))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 
200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 28)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + 5488))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n 
compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 90112))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 106496))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122880))];\n compute_d_shared[((((int)threadIdx.x) + 4096))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 131072))];\n compute_d_shared[((((int)threadIdx.x) + 4352))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 139264))];\n compute_d_shared[((((int)threadIdx.x) + 4608))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n 
compute_d_shared[((((int)threadIdx.x) + 4864))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 155648))];\n compute_d_shared[((((int)threadIdx.x) + 5120))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 163840))];\n compute_d_shared[((((int)threadIdx.x) + 5376))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 172032))];\n compute_d_shared[((((int)threadIdx.x) + 5632))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 180224))];\n compute_d_shared[((((int)threadIdx.x) + 5888))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 188416))];\n compute_d_shared[((((int)threadIdx.x) + 6144))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 196608))];\n compute_d_shared[((((int)threadIdx.x) + 6400))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 204800))];\n compute_d_shared[((((int)threadIdx.x) + 6656))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 212992))];\n compute_d_shared[((((int)threadIdx.x) + 6912))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 221184))];\n compute_d_shared[((((int)threadIdx.x) + 7168))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 229376))];\n 
compute_d_shared[((((int)threadIdx.x) + 7424))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 237568))];\n compute_d_shared[((((int)threadIdx.x) + 7680))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 245760))];\n compute_d_shared[((((int)threadIdx.x) + 7936))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 253952))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 7)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 8))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 16))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 24))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 32))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 40))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 48))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 56))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(4)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 4096))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 5120))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 6144))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 7168))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + 
(compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n 
compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200704))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200704))]);\n 
compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401408))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401408))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602112))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602112))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802816))] = (compute_local[(32)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802816))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003520))] = (compute_local[(40)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003520))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204224))] = (compute_local[(48)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204224))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404928))] = (compute_local[(56)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) 
+ ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404928))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 8))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 8))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200712))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200712))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401416))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401416))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602120))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602120))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802824))] = (compute_local[(33)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802824))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + 
(((int)threadIdx.x) & 7)) + 1003528))] = (compute_local[(41)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003528))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204232))] = (compute_local[(49)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204232))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404936))] = (compute_local[(57)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404936))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 16))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 16))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200720))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200720))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401424))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401424))]);\n 
compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602128))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602128))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802832))] = (compute_local[(34)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802832))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003536))] = (compute_local[(42)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003536))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204240))] = (compute_local[(50)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204240))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404944))] = (compute_local[(58)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404944))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 24))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + 
((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 24))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200728))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200728))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401432))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401432))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602136))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602136))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802840))] = (compute_local[(35)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802840))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003544))] = (compute_local[(43)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003544))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + 
(((int)threadIdx.x) & 7)) + 1204248))] = (compute_local[(51)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204248))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404952))] = (compute_local[(59)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404952))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 32))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200736))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200736))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401440))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401440))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602144))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602144))]);\n 
compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802848))] = (compute_local[(36)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802848))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003552))] = (compute_local[(44)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003552))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204256))] = (compute_local[(52)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204256))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404960))] = (compute_local[(60)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404960))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 40))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 40))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200744))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + 
((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200744))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401448))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401448))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602152))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602152))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802856))] = (compute_local[(37)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802856))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003560))] = (compute_local[(45)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003560))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204264))] = (compute_local[(53)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204264))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 
64)) + (((int)threadIdx.x) & 7)) + 1404968))] = (compute_local[(61)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404968))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 48))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 48))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200752))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200752))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401456))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401456))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602160))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602160))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802864))] = (compute_local[(38)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802864))]);\n 
compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003568))] = (compute_local[(46)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003568))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204272))] = (compute_local[(54)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204272))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404976))] = (compute_local[(62)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404976))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 56))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 56))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200760))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200760))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401464))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + 
((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401464))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602168))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602168))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802872))] = (compute_local[(39)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 802872))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003576))] = (compute_local[(47)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1003576))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204280))] = (compute_local[(55)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1204280))]);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404984))] = (compute_local[(63)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 1404984))]);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[256,1024,1,1]_[128,256,14,14].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[256,1024,1,1]_[128,256,14,14].json new file mode 100644 index 000000000..88353e86c --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[256,1024,1,1]_[128,256,14,14].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1024, 14, 14], "filter_shape": [256, 1024, 1, 1], "output_shape": [128, 256, 14, 14], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_1024_14_14___256_1024_1_1___128_256_14_14_", "code": "extern \"C\" __global__ void roller_Convolution__128_1024_14_14___256_1024_1_1___128_256_14_14_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[8192];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n 
compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)))];\n compute_shared[((((int)threadIdx.x) + 
256))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 196))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 392))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 588))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 784))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 980))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 1176))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 1372))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) 
+ ((int)threadIdx.x)) % 196)) + 1764))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 1960))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 2156))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 2352))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 2548))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 2744))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 2940))];\n compute_shared[((((int)threadIdx.x) + 4096))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 4352))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 3332))];\n compute_shared[((((int)threadIdx.x) + 4608))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + 
((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 3528))];\n compute_shared[((((int)threadIdx.x) + 4864))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 3724))];\n compute_shared[((((int)threadIdx.x) + 5120))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 3920))];\n compute_shared[((((int)threadIdx.x) + 5376))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 4116))];\n compute_shared[((((int)threadIdx.x) + 5632))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 4312))];\n compute_shared[((((int)threadIdx.x) + 5888))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 4508))];\n compute_shared[((((int)threadIdx.x) + 6144))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 6400))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 4900))];\n compute_shared[((((int)threadIdx.x) + 6656))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 5096))];\n 
compute_shared[((((int)threadIdx.x) + 6912))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 5292))];\n compute_shared[((((int)threadIdx.x) + 7168))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 5488))];\n compute_shared[((((int)threadIdx.x) + 7424))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 5684))];\n compute_shared[((((int)threadIdx.x) + 7680))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 5880))];\n compute_shared[((((int)threadIdx.x) + 7936))] = data[(((((((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) / 196) * 200704) + (k_outer * 6272)) + ((((((int)blockIdx.x) % 98) * 256) + ((int)threadIdx.x)) % 196)) + 6076))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = 
kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 98) * 65536) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 96))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 128))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 160))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 192))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 224))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) 
+ 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = 
(compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + 
(compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200704))] = 
max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200704))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401408))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602112))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602112))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802816))] = max((compute_local[(32)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802816))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003520))] = max((compute_local[(40)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003520))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204224))] = max((compute_local[(48)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 
25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204224))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1404928))] = max((compute_local[(56)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1404928))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200736))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200736))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401440))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602144))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602144))]), 0.000000e+00f);\n 
compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802848))] = max((compute_local[(33)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802848))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003552))] = max((compute_local[(41)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003552))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204256))] = max((compute_local[(49)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204256))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1404960))] = max((compute_local[(57)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1404960))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 
98) * 256)) + (((int)threadIdx.x) & 31)) + 200768))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200768))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401472))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401472))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602176))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602176))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802880))] = max((compute_local[(34)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802880))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003584))] = max((compute_local[(42)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003584))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204288))] = max((compute_local[(50)] + 
bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204288))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1404992))] = max((compute_local[(58)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1404992))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200800))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200800))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401504))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401504))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602208))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 
256)) + (((int)threadIdx.x) & 31)) + 602208))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802912))] = max((compute_local[(35)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802912))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003616))] = max((compute_local[(43)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003616))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204320))] = max((compute_local[(51)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204320))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405024))] = max((compute_local[(59)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405024))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 128))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 128))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 
1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200832))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200832))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401536))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401536))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602240))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602240))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802944))] = max((compute_local[(36)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802944))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003648))] = max((compute_local[(44)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003648))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 
31)) + 1204352))] = max((compute_local[(52)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204352))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405056))] = max((compute_local[(60)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 160))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 160))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200864))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200864))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401568))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401568))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602272))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + 
((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602272))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802976))] = max((compute_local[(37)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 802976))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003680))] = max((compute_local[(45)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003680))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204384))] = max((compute_local[(53)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204384))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405088))] = max((compute_local[(61)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 192))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 
192))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200896))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200896))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401600))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401600))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602304))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602304))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 803008))] = max((compute_local[(38)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 803008))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003712))] = max((compute_local[(46)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003712))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 
25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204416))] = max((compute_local[(54)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204416))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405120))] = max((compute_local[(62)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405120))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 224))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 224))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200928))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 200928))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401632))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 401632))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602336))] = max((compute_local[(31)] 
+ bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 602336))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 803040))] = max((compute_local[(39)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 803040))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003744))] = max((compute_local[(47)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1003744))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204448))] = max((compute_local[(55)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1204448))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405152))] = max((compute_local[(63)] + bias[(((((((((int)blockIdx.x) / 98) * 1605632) + ((((int)threadIdx.x) >> 5) * 25088)) + ((((int)blockIdx.x) % 98) * 256)) + (((int)threadIdx.x) & 31)) + 1405152))]), 0.000000e+00f);\n}\n", "gridDim": [392, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[512,1024,1,1]_[128,512,14,14].json 
b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[512,1024,1,1]_[128,512,14,14].json new file mode 100644 index 000000000..34bd4f039 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,1024,14,14]_[512,1024,1,1]_[128,512,14,14].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 1024, 14, 14], "filter_shape": [512, 1024, 1, 1], "output_shape": [128, 512, 14, 14], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_1024_14_14___512_1024_1_1___128_512_14_14_", "code": "extern \"C\" __global__ void roller_Convolution__128_1024_14_14___512_1024_1_1___128_512_14_14_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n 
compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 32; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + 
(((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 392))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 784))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 1176))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 1960))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 2352))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 2744))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) % 
196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 3528))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 3920))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 4312))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 5096))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 5488))];\n compute_shared[((((int)threadIdx.x) + 3840))] = 
data[((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 200704) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 7) * 196)) + ((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196)) + 5880))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n 
compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 90112))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 106496))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 196) * 131072) + ((((int)threadIdx.x) >> 5) * 1024)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 122880))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) 
+ 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = 
(compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + 
(compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n 
compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401408))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802816))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802816))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204224))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204224))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(32)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))]), 
0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007040))] = max((compute_local[(40)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007040))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408448))] = max((compute_local[(48)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408448))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809856))] = max((compute_local[(56)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809856))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401424))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) 
>> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802832))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802832))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204240))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204240))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(33)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007056))] = max((compute_local[(41)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408464))] = max((compute_local[(49)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408464))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) 
+ 2809872))] = max((compute_local[(57)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809872))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401440))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802848))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802848))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204256))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204256))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(34)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + 
((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007072))] = max((compute_local[(42)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007072))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408480))] = max((compute_local[(50)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408480))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809888))] = max((compute_local[(58)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809888))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401456))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + 
(((int)threadIdx.x) & 15)) + 401456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802864))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802864))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204272))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204272))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(35)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007088))] = max((compute_local[(43)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408496))] = max((compute_local[(51)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408496))]), 0.000000e+00f);\n 
compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809904))] = max((compute_local[(59)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809904))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 64))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401472))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401472))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802880))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802880))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204288))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204288))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + 
((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))] = max((compute_local[(36)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007104))] = max((compute_local[(44)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007104))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408512))] = max((compute_local[(52)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408512))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809920))] = max((compute_local[(60)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809920))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 80))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401488))] = 
max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401488))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802896))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802896))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204304))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204304))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))] = max((compute_local[(37)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007120))] = max((compute_local[(45)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007120))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408528))] = max((compute_local[(53)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + 
((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408528))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809936))] = max((compute_local[(61)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809936))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 96))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401504))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401504))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802912))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802912))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204320))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + 
(((int)threadIdx.x) & 15)) + 1204320))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))] = max((compute_local[(38)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007136))] = max((compute_local[(46)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007136))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408544))] = max((compute_local[(54)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408544))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809952))] = max((compute_local[(62)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809952))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 112))]), 0.000000e+00f);\n 
compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401520))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 401520))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802928))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 802928))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204336))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1204336))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))] = max((compute_local[(39)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007152))] = max((compute_local[(47)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2007152))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 
25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408560))] = max((compute_local[(55)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2408560))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809968))] = max((compute_local[(63)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 15)) + 2809968))]), 0.000000e+00f);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[128,128,3,3]_[128,128,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[128,128,3,3]_[128,128,28,28].json new file mode 100644 index 000000000..7636c1b22 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[128,128,3,3]_[128,128,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 128, 28, 28], "filter_shape": [128, 128, 3, 3], "output_shape": [128, 128, 28, 28], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_128_28_28___128_128_3_3___128_128_28_28_", "code": "extern \"C\" __global__ void roller_Convolution__128_128_28_28___128_128_3_3___128_128_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 
0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n 
compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 36; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 4) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 4) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = 
data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 6) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 6) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 8) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 8) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 10) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 12) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 3) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) * 128) + 
(((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 14) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 5) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 16) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 7) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)) + 1800))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 20) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) 
+ (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 22) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 4) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 24) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 6) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 26) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 8) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 28) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 115200) + (((((k_outer * 32) + 
(((int)threadIdx.x) >> 7)) + 30) / 9) * 900)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 30)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 3) % 9) / 3) * 30)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 9216))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 27648))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 46080))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 55296))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 64512))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 82944))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + 
(((int)threadIdx.x) & 31)) + 92160))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 101376))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 119808))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 129024))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 138240))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n 
compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + 
(compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n 
compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * 
compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211264))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816896))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816896))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028160))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633792))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633792))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239424))] = max((compute_local[(56)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239424))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211280))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816912))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816912))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028176))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028176))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633808))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633808))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239440))] = max((compute_local[(57)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239440))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211296))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816928))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816928))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028192))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633824))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633824))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239456))] = max((compute_local[(58)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239456))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211312))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211312))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816944))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816944))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028208))] = max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028208))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633840))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633840))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239472))] = max((compute_local[(59)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239472))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211328))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211328))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816960))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816960))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))] = max((compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028224))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633856))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633856))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239488))] = max((compute_local[(60)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239488))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211344))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211344))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816976))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816976))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))] = max((compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028240))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028240))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633872))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633872))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239504))] = max((compute_local[(61)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239504))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211360))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211360))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816992))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816992))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028256))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028256))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633888))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633888))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239520))] = max((compute_local[(62)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239520))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211376))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211376))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4817008))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4817008))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028272))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028272))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633904))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633904))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239536))] = max((compute_local[(63)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239536))]), 0.000000e+00f);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git 
a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[512,128,1,1]_[128,512,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[512,128,1,1]_[128,512,28,28].json new file mode 100644 index 000000000..c2e786b82 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,28,28]_[512,128,1,1]_[128,512,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 128, 28, 28], "filter_shape": [512, 128, 1, 1], "output_shape": [128, 512, 28, 28], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_128_28_28___512_128_1_1___128_512_28_28_", "code": "extern \"C\" __global__ void roller_Convolution__128_128_28_28___512_128_1_1___128_512_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 
0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 4; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + 
(((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 7840))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 9408))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 10976))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 14112))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 15680))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 17248))];\n compute_shared[((((int)threadIdx.x) + 3072))] = 
data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 18816))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 20384))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 21952))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) / 784) * 100352) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((int)blockIdx.x) % 784) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 23520))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1024))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2048))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3072))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 
128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5120))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6144))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 784) * 8192) + ((((int)threadIdx.x) >> 5) * 128)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 7168))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n 
compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) 
+ 802816))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 802816))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605632))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605632))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408448))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408448))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211264))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211264))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014080))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014080))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816896))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + 
(((int)threadIdx.x) & 31)) + 4816896))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619712))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619712))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 802848))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 802848))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605664))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605664))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408480))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408480))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + 
(((int)threadIdx.x) & 31)) + 3211296))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211296))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014112))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014112))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816928))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816928))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619744))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619744))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 802880))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) 
* 128)) + (((int)threadIdx.x) & 31)) + 802880))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605696))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605696))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408512))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408512))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211328))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211328))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014144))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014144))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816960))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816960))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 
784) * 128)) + (((int)threadIdx.x) & 31)) + 5619776))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619776))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 802912))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 802912))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605728))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 1605728))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408544))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 2408544))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211360))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + 
((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 3211360))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014176))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4014176))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816992))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 4816992))]);\n compute[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619808))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 784) * 6422528) + ((((int)threadIdx.x) >> 5) * 100352)) + ((((int)blockIdx.x) % 784) * 128)) + (((int)threadIdx.x) & 31)) + 5619808))]);\n}\n", "gridDim": [6272, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,58,58]_[128,128,3,3]_[128,128,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,58,58]_[128,128,3,3]_[128,128,28,28].json new file mode 100644 index 000000000..413a80d6c --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,128,58,58]_[128,128,3,3]_[128,128,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 128, 58, 58], "filter_shape": [128, 128, 3, 3], "output_shape": [128, 128, 28, 28], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": 
"roller_Convolution__128_128_58_58___128_128_3_3___128_128_28_28_", "code": "extern \"C\" __global__ void roller_Convolution__128_128_58_58___128_128_3_3___128_128_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 
0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 36; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n 
compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 4) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 4) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 6) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 6) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 8) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 8) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 10) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n 
compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 12) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 3) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 14) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 5) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 16) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 7) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)) + 6728))];\n 
compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 20) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 22) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 4) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 24) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 6) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 26) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 8) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 2) % 3)))];\n 
compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 28) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 430592) + (((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 30) / 9) * 3364)) + (((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784) / 28) * 116)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 7)) + 3) % 9) / 3) * 58)) + ((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 28) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 7)) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 9216))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 27648))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 46080))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 5) 
* 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 55296))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 64512))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 82944))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 92160))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 101376))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 119808))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 129024))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[((((((((int)threadIdx.x) >> 5) * 1152) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 138240))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n 
compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = 
(compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + 
(compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n 
compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211264))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816896))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816896))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028160))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 9633792))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633792))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239424))] = max((compute_local[(56)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239424))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211280))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816912))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816912))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 8028176))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028176))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633808))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633808))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239440))] = max((compute_local[(57)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239440))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211296))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816928))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816928))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 6422560))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028192))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633824))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633824))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239456))] = max((compute_local[(58)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239456))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211312))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211312))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 4816944))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816944))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028208))] = max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028208))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633840))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633840))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239472))] = max((compute_local[(59)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239472))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 3211328))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211328))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816960))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816960))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))] = max((compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028224))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633856))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633856))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239488))] = max((compute_local[(60)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239488))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 1605712))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211344))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211344))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816976))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816976))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))] = max((compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028240))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028240))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633872))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633872))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239504))] = max((compute_local[(61)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239504))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 
128)) + (((int)threadIdx.x) & 15)) + 96))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211360))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211360))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816992))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816992))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028256))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028256))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633888))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633888))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 11239520))] = max((compute_local[(62)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239520))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211376))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211376))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4817008))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4817008))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028272))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028272))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + 
(((int)threadIdx.x) & 15)) + 9633904))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633904))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239536))] = max((compute_local[(63)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239536))]), 0.000000e+00f);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,2048,7,7]_[512,2048,1,1]_[128,512,7,7].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,2048,7,7]_[512,2048,1,1]_[128,512,7,7].json new file mode 100644 index 000000000..56d1044c4 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,2048,7,7]_[512,2048,1,1]_[128,512,7,7].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2048, 7, 7], "filter_shape": [512, 2048, 1, 1], "output_shape": [128, 512, 7, 7], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_2048_7_7___512_2048_1_1___128_512_7_7_", "code": "extern \"C\" __global__ void roller_Convolution__128_2048_7_7___512_2048_1_1___128_512_7_7_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[4];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 
0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 64; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 196))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 392))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 
100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 588))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 784))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 980))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 1176))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 100352) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 6) * 49)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49)) + 1372))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + 
(((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 65536))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 81920))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 98304))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 114688))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 131072))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 163840))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 180224))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 196608))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 
31)) + 212992))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 229376))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 98) * 262144) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 245760))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 7)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 8))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 16))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 24))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 32))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 40))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 48))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 56))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 3072))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n 
compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * 
compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200704))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200704))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + 
((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401408))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602112))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602112))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 8))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 8))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200712))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200712))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401416))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401416))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602120))] = max((compute_local[(25)] + 
bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602120))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 16))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200720))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200720))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401424))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602128))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602128))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 24))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 24))]), 
0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200728))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200728))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401432))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401432))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602136))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602136))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 32))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200736))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200736))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + 
(((int)threadIdx.x) & 7)) + 401440))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602144))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602144))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 40))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 40))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200744))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200744))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401448))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401448))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602152))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + 
((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602152))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 48))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200752))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200752))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401456))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602160))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602160))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 56))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 56))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 
3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200760))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 200760))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401464))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 401464))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602168))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 3) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 7)) + 602168))]), 0.000000e+00f);\n}\n", "gridDim": [392, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[1024,256,1,1]_[128,1024,14,14].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[1024,256,1,1]_[128,1024,14,14].json new file mode 100644 index 000000000..7542bac8a --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[1024,256,1,1]_[128,1024,14,14].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 256, 14, 14], "filter_shape": [1024, 256, 1, 1], "output_shape": [128, 1024, 14, 14], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_256_14_14___1024_256_1_1___128_1024_14_14_", "code": "extern \"C\" __global__ void 
roller_Convolution__128_256_14_14___1024_256_1_1___128_1024_14_14_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)))];\n compute_shared[((((int)threadIdx.x) + 256))] = 
data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 784))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 2352))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 3920))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 50176) + (k_outer * 6272)) + ((((int)threadIdx.x) >> 6) * 196)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196)) + 5488))];\n compute_d_shared[(((int)threadIdx.x))] = 
kernel[((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2048))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6144))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10240))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14336))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + 
((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 22528))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 26624))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 392) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 30720))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + 
k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = 
(compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + 
((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401408))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401408))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802816))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802816))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204224))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204224))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007040))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 
64)) + (((int)threadIdx.x) & 15)) + 2007040))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408448))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408448))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809856))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809856))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 16))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401424))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401424))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802832))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802832))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) 
+ 1204240))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204240))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007056))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007056))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408464))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408464))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809872))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809872))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 
32))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401440))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401440))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802848))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802848))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204256))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204256))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007072))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007072))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408480))] = 
(compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408480))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809888))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809888))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 48))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401456))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401456))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802864))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802864))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204272))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204272))]);\n 
compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007088))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007088))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408496))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408496))]);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809904))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809904))]);\n}\n", "gridDim": [3136, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[256,256,3,3]_[128,256,14,14].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[256,256,3,3]_[128,256,14,14].json new file mode 100644 index 000000000..9cbf9277b --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,14,14]_[256,256,3,3]_[128,256,14,14].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 
256, 14, 14], "filter_shape": [256, 256, 3, 3], "output_shape": [128, 256, 14, 14], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_256_14_14___256_256_3_3___128_256_14_14_", "code": "extern \"C\" __global__ void roller_Convolution__128_256_14_14___256_256_3_3___128_256_14_14_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 72; ++k_outer) {\n 
__syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 12) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 3) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + (((k_outer * 32) + (((int)threadIdx.x) >> 
6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 16) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 7) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 20) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 24) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 6) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 65536) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 28) / 9) * 256)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 16)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 9) / 3) * 16)) + ((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14)) + ((((k_outer * 32) + 
(((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 55296))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 92160))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 129024))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 
165888))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 184320))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 202752))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 221184))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 239616))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 258048))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 276480))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + 
(compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401408))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802816))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802816))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204224))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204224))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) 
+ ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007040))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007040))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408448))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408448))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809856))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809856))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401424))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) 
+ 802832))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802832))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204240))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204240))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007056))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408464))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408464))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809872))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + 
((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809872))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401440))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802848))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802848))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204256))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204256))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 
1605664))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007072))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007072))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408480))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408480))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809888))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809888))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401456))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + 
((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802864))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802864))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204272))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204272))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007088))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408496))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408496))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + 
(((int)threadIdx.x) & 15)) + 2809904))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809904))]), 0.000000e+00f);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,30,30]_[256,256,3,3]_[128,256,14,14].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,30,30]_[256,256,3,3]_[128,256,14,14].json new file mode 100644 index 000000000..22e5d1dd5 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,30,30]_[256,256,3,3]_[128,256,14,14].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 256, 30, 30], "filter_shape": [256, 256, 3, 3], "output_shape": [128, 256, 14, 14], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_256_30_30___256_256_3_3___128_256_14_14_", "code": "extern \"C\" __global__ void roller_Convolution__128_256_30_30___256_256_3_3___128_256_14_14_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 
0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 72; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + 
(((int)threadIdx.x) >> 6)) + 8) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 12) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 3) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 16) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 7) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 20) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + 
(((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 24) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 6) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) / 196) * 230400) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 28) / 9) * 900)) + ((((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 196) / 14) * 60)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 9) / 3) * 30)) + (((((((int)blockIdx.x) % 392) * 64) + (((int)threadIdx.x) & 63)) % 14) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 55296))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + 
((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 92160))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 129024))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 165888))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 184320))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 202752))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 221184))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 239616))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + ((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 258048))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 392) * 294912) + 
((((int)threadIdx.x) >> 5) * 2304)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 276480))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401408))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802816))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 
15)) + 802816))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204224))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204224))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007040))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007040))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408448))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408448))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809856))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809856))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + 
((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401424))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802832))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802832))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204240))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204240))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 
2007056))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408464))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408464))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809872))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809872))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401440))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802848))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + 
((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802848))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204256))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204256))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007072))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007072))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408480))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408480))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809888))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + 
(((int)threadIdx.x) & 15)) + 2809888))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401456))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 401456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802864))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 802864))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204272))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1204272))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 
3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007088))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2007088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408496))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2408496))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809904))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 392) * 3211264) + ((((int)threadIdx.x) >> 4) * 25088)) + ((((int)blockIdx.x) % 392) * 64)) + (((int)threadIdx.x) & 15)) + 2809904))]), 0.000000e+00f);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[128,256,1,1]_[128,128,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[128,256,1,1]_[128,128,28,28].json new file mode 100644 index 000000000..3cf4299fc --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[128,256,1,1]_[128,128,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 256, 56, 56], "filter_shape": [128, 256, 1, 1], "output_shape": [128, 128, 56, 56], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_256_56_56___128_256_1_1___128_128_28_28_", "code": "extern \"C\" __global__ void 
roller_Convolution__128_256_56_56___128_256_1_1___128_128_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n 
compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) 
+ 18816))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 25088))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 31360))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 37632))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 43904))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 50176))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 56448))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 62720))];\n compute_shared[((((int)threadIdx.x) + 
2816))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 68992))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 75264))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 81536))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 87808))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 94080))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2048))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6144))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] 
= kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10240))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14336))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 22528))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 26624))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 30720))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) 
& 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] 
= (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + 
(compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n 
compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845056))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267584))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690112))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 
401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112640))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112640))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535168))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535168))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957696))] = max((compute_local[(56)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957696))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845072))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845072))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267600))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267600))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 
401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690128))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690128))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112656))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112656))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535184))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535184))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957712))] = max((compute_local[(57)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957712))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845088))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845088))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) 
* 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267616))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267616))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690144))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690144))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112672))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112672))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535200))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535200))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957728))] = max((compute_local[(58)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957728))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 
4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845104))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845104))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267632))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267632))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690160))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112688))] = max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112688))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535216))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535216))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957744))] = max((compute_local[(59)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957744))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) 
>> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845120))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845120))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267648))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267648))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690176))] = max((compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690176))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112704))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112704))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535232))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535232))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957760))] = max((compute_local[(60)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957760))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845136))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845136))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267664))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267664))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690192))] = max((compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112720))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112720))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535248))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535248))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957776))] = max((compute_local[(61)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957776))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845152))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845152))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267680))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267680))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690208))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690208))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112736))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 
32112736))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535264))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957792))] = max((compute_local[(62)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957792))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845168))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 12845168))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267696))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 19267696))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 25690224))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 
15)) + 25690224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112752))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32112752))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535280))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 38535280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957808))] = max((compute_local[(63)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 44957808))]), 0.000000e+00f);\n}\n", "gridDim": [3136, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[512,256,1,1]_[128,512,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[512,256,1,1]_[128,512,28,28].json new file mode 100644 index 000000000..659dcec5b --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[512,256,1,1]_[128,512,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 256, 56, 56], "filter_shape": [512, 256, 1, 1], "output_shape": [128, 512, 28, 28], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_256_56_56___512_256_1_1___128_512_28_28_", "code": "extern \"C\" __global__ void roller_Convolution__128_256_56_56___512_256_1_1___128_512_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n 
__shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[4];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + 
((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 25088))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 37632))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 50176))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 62720))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 
64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 75264))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 6) * 3136)) + ((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784) / 28) * 112)) + (((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 28) * 2)) + 87808))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2048))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6144))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10240))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + 
((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14336))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 22528))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 26624))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 1568) * 32768) + ((((int)threadIdx.x) >> 5) * 256)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 30720))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 7)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 8))];\n compute_shared_local[(2)] = 
compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 16))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 24))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 32))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 40))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 48))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 7)) + 56))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 3072))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + 
(compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211264))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211264))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422528))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422528))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633792))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633792))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + 
((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 8))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 8))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211272))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211272))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422536))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422536))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633800))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633800))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 16))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 16))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211280))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 1568) * 
12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211280))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422544))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422544))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633808))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633808))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 24))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 24))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211288))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211288))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422552))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422552))]);\n compute[(((((((((int)blockIdx.x) / 
1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633816))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633816))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 32))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211296))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211296))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422560))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422560))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633824))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633824))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 40))] = (compute_local[(5)] + 
bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 40))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211304))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211304))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422568))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422568))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633832))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633832))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 48))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 48))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211312))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211312))]);\n 
compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422576))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422576))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633840))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633840))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 56))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 56))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211320))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 3211320))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422584))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 6422584))]);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633848))] 
= (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 3) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 7)) + 9633848))]);\n}\n", "gridDim": [6272, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[64,256,1,1]_[128,64,56,56].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[64,256,1,1]_[128,64,56,56].json new file mode 100644 index 000000000..43b9329eb --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,256,56,56]_[64,256,1,1]_[128,64,56,56].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 256, 56, 56], "filter_shape": [64, 256, 1, 1], "output_shape": [128, 64, 56, 56], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_256_56_56___64_256_1_1___128_64_56_56_", "code": "extern \"C\" __global__ void roller_Convolution__128_256_56_56___64_256_1_1___128_64_56_56_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] 
= 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 18816))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + 
((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 25088))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 31360))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 37632))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 43904))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 50176))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 56448))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 62720))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + 
(((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 68992))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 75264))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 81536))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 87808))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 802816) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 94080))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2048))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6144))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = 
kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 10240))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((int)threadIdx.x) >> 5) * 256) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 14336))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211264))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422528))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633792))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633792))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845056))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056320))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056320))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267584))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478848))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478848))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211296))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + 
(((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422560))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633824))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633824))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845088))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845088))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056352))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056352))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267616))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267616))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478880))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478880))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 
401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211328))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211328))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422592))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633856))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633856))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845120))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845120))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056384))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056384))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267648))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267648))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478912))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 
5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478912))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211360))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 3211360))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422624))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 6422624))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633888))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 9633888))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845152))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 12845152))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056416))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 16056416))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267680))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) 
* 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 19267680))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478944))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 31)) + 22478944))]), 0.000000e+00f);\n}\n", "gridDim": [3136, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,3,230,230]_[64,3,7,7]_[128,64,112,112].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,3,230,230]_[64,3,7,7]_[128,64,112,112].json new file mode 100644 index 000000000..8f15ef3e0 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,3,230,230]_[64,3,7,7]_[128,64,112,112].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 3, 230, 230], "filter_shape": [64, 3, 7, 7], "output_shape": [128, 64, 112, 112], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_3_230_230___64_3_7_7___128_64_112_112_", "code": "extern \"C\" __global__ void roller_Convolution__128_3_230_230___64_3_7_7___128_64_112_112_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[8192];\n __shared__ float compute_d_shared[2048];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] 
= 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 
0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 5; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + (((k_outer * 32) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + ((((k_outer * 32) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + ((k_outer * 32) % 7)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 1) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 1) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 1) % 7)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 2) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 2) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 2) % 7)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 3) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 3) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 3) % 7)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 4) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + 
(((((k_outer * 32) + 4) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 4) % 7)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 5) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 5) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 5) % 7)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 6) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 6) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 6) % 7)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 7) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 7) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + ((k_outer * 32) % 7)))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 8) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 8) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 1) % 7)))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 9) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 9) % 49) / 7) 
* 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 2) % 7)))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 10) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 10) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 3) % 7)))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 11) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 11) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 4) % 7)))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 12) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 12) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 5) % 7)))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 13) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 13) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 6) % 7)))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 14) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 14) % 49) / 7) * 230)) + 
((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + ((k_outer * 32) % 7)))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 15) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 15) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 1) % 7)))];\n compute_shared[((((int)threadIdx.x) + 4096))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 16) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 16) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 2) % 7)))];\n compute_shared[((((int)threadIdx.x) + 4352))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 17) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 17) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 3) % 7)))];\n compute_shared[((((int)threadIdx.x) + 4608))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 18) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 18) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 4) % 7)))];\n compute_shared[((((int)threadIdx.x) + 4864))] = ((k_outer < 4) ? 
data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 19) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 19) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 5) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5120))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 20) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 20) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 6) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5376))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 21) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 21) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + ((k_outer * 32) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5632))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 22) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 22) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 1) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 5888))] = ((k_outer < 4) ? 
data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 23) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 23) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 2) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6144))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 24) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 24) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 3) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6400))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 25) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 25) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 4) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6656))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 26) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 26) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 5) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 6912))] = ((k_outer < 4) ? 
data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 27) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 27) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 6) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7168))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 28) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 28) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + ((k_outer * 32) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7424))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 29) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 29) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 1) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7680))] = ((k_outer < 4) ? data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 30) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 30) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 2) % 7)))] : 0.000000e+00f);\n compute_shared[((((int)threadIdx.x) + 7936))] = ((k_outer < 4) ? 
data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 12544) * 158700) + ((((k_outer * 32) + 31) / 49) * 52900)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 12544) / 112) * 460)) + (((((k_outer * 32) + 31) % 49) / 7) * 230)) + ((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 112) * 2)) + (((k_outer * 32) + 3) % 7)))] : 0.000000e+00f);\n compute_d_shared[(((int)threadIdx.x))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[(((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 256))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1176))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 512))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2352))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 768))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3528))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1024))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4704))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1280))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5880))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1536))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? 
kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 7056))] : 0.000000e+00f);\n compute_d_shared[((((int)threadIdx.x) + 1792))] = ((((k_outer * 32) + (((int)threadIdx.x) & 31)) < 147) ? kernel[((((((((int)threadIdx.x) >> 5) * 147) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8232))] : 0.000000e+00f);\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 96))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 128))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 160))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 192))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 224))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n if (((k_outer * 32) + k_inner_outer) < 147) {\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n 
compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * 
compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845056))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690112))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 
1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535168))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535168))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380224))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225280))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070336))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070336))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915392))] = max((compute_local[(56)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915392))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845088))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845088))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690144))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690144))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535200))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535200))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380256))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380256))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225312))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225312))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070368))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070368))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915424))] = max((compute_local[(57)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915424))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845120))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845120))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690176))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690176))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535232))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535232))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380288))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380288))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225344))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225344))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070400))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070400))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915456))] = max((compute_local[(58)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + 
(((int)threadIdx.x) & 31)) + 89915456))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845152))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845152))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690208))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690208))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535264))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380320))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380320))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225376))] = max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225376))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070432))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + 
(((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070432))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915488))] = max((compute_local[(59)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915488))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 128))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 128))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845184))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845184))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690240))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690240))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535296))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380352))] = max((compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380352))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225408))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) 
>> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225408))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070464))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070464))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915520))] = max((compute_local[(60)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915520))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 160))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845216))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845216))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690272))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690272))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535328))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535328))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380384))] = max((compute_local[(37)] + 
bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380384))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225440))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225440))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070496))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070496))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915552))] = max((compute_local[(61)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915552))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 192))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845248))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845248))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690304))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690304))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535360))] = 
max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535360))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380416))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380416))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225472))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225472))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070528))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915584))] = max((compute_local[(62)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 224))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845280))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 
31)) + 25690336))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 25690336))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535392))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 38535392))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380448))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 51380448))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225504))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64225504))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070560))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 77070560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915616))] = max((compute_local[(63)] + bias[((((((((int)threadIdx.x) >> 5) * 1605632) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 89915616))]), 0.000000e+00f);\n}\n", "gridDim": [6272, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,16,16]_[512,512,3,3]_[128,512,7,7].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,16,16]_[512,512,3,3]_[128,512,7,7].json new file mode 100644 index 000000000..e7907f2b4 
--- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,16,16]_[512,512,3,3]_[128,512,7,7].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 16, 16], "filter_shape": [512, 512, 3, 3], "output_shape": [128, 512, 7, 7], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_512_16_16___512_512_3_3___128_512_7_7_", "code": "extern \"C\" __global__ void roller_Convolution__128_512_16_16___512_512_3_3___128_512_7_7_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 
0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 144; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 12) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + 
((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 3) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 16) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 7) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 20) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 24) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 32)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 6) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 131072) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 28) / 9) * 256)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) 
% 49) / 7) * 32)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 9) / 3) * 16)) + (((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7) * 2)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 184320))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 221184))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 258048))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 294912))];\n 
compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 331776))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 368640))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 405504))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 442368))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 479232))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 516096))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 552960))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n 
compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + 
(compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] 
= (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100352))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100352))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200704))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200704))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301056))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401408))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 
15)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501760))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501760))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602112))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602112))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702464))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702464))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100368))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100368))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 
98) * 64)) + (((int)threadIdx.x) & 15)) + 200720))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200720))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301072))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301072))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401424))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501776))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501776))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602128))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602128))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702480))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + 
((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702480))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100384))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100384))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200736))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200736))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301088))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401440))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401440))]), 0.000000e+00f);\n 
compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501792))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501792))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602144))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602144))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702496))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702496))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100400))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100400))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 
15)) + 200752))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200752))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301104))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301104))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401456))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501808))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501808))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602160))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602160))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702512))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + 
((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702512))]), 0.000000e+00f);\n}\n", "gridDim": [392, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[1024,512,1,1]_[128,1024,14,14].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[1024,512,1,1]_[128,1024,14,14].json new file mode 100644 index 000000000..fea082775 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[1024,512,1,1]_[128,1024,14,14].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 28, 28], "filter_shape": [1024, 512, 1, 1], "output_shape": [128, 1024, 14, 14], "window_movement_strides": [2, 2], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_512_28_28___1024_512_1_1___128_1024_14_14_", "code": "extern \"C\" __global__ void roller_Convolution__128_512_28_28___1024_512_1_1___128_1024_14_14_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[16];\n float compute_d_shared_local[4];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(4)] = 
0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 16; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) % 196) * 
128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + 
((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 7840))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 9408))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 10976))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 14112))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 
128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 15680))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 17248))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 18816))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 20384))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + (((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 21952))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) / 196) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + ((((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 196) / 14) * 56)) + 
(((((((int)blockIdx.x) % 196) * 128) + (((int)threadIdx.x) & 127)) % 14) * 2)) + 23520))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + 
(((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 196) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 7)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 8))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 16))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 24))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 32))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 40))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer 
* 128) + (((int)threadIdx.x) & 7)) + 48))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 56))];\n compute_shared_local[(8)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 64))];\n compute_shared_local[(9)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 72))];\n compute_shared_local[(10)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 80))];\n compute_shared_local[(11)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 88))];\n compute_shared_local[(12)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 96))];\n compute_shared_local[(13)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 104))];\n compute_shared_local[(14)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 112))];\n compute_shared_local[(15)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 7)) + 120))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 3) * 32) + k_inner_outer) + 3072))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n 
compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * 
compute_d_shared_local[(3)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(8)] * compute_d_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(8)] * compute_d_shared_local[(1)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(8)] * compute_d_shared_local[(2)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(8)] * compute_d_shared_local[(3)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(9)] * compute_d_shared_local[(0)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(9)] * compute_d_shared_local[(1)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(9)] * compute_d_shared_local[(2)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(9)] * compute_d_shared_local[(3)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(10)] * compute_d_shared_local[(0)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(10)] * compute_d_shared_local[(1)]));\n compute_local[(42)] = (compute_local[(42)] + 
(compute_shared_local[(10)] * compute_d_shared_local[(2)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(10)] * compute_d_shared_local[(3)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(11)] * compute_d_shared_local[(0)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(11)] * compute_d_shared_local[(1)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(11)] * compute_d_shared_local[(2)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(11)] * compute_d_shared_local[(3)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(12)] * compute_d_shared_local[(0)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(12)] * compute_d_shared_local[(1)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(12)] * compute_d_shared_local[(2)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(12)] * compute_d_shared_local[(3)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(13)] * compute_d_shared_local[(0)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(13)] * compute_d_shared_local[(1)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(13)] * compute_d_shared_local[(2)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(13)] * compute_d_shared_local[(3)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(14)] * compute_d_shared_local[(0)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(14)] * compute_d_shared_local[(1)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(14)] * compute_d_shared_local[(2)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(14)] * compute_d_shared_local[(3)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(15)] * 
compute_d_shared_local[(0)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(15)] * compute_d_shared_local[(1)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(15)] * compute_d_shared_local[(2)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(15)] * compute_d_shared_local[(3)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802816))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802816))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605632))] = (compute_local[(32)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605632))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408448))] = (compute_local[(48)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408448))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 8))] = (compute_local[(1)] + 
bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 8))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802824))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802824))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605640))] = (compute_local[(33)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605640))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408456))] = (compute_local[(49)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408456))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 16))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 16))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802832))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802832))]);\n compute[(((((((((int)blockIdx.x) / 196) * 
3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605648))] = (compute_local[(34)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605648))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408464))] = (compute_local[(50)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408464))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 24))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 24))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802840))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802840))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605656))] = (compute_local[(35)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605656))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408472))] = (compute_local[(51)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + 
((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408472))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 32))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802848))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802848))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605664))] = (compute_local[(36)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605664))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408480))] = (compute_local[(52)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408480))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 40))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 40))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + 
((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802856))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802856))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605672))] = (compute_local[(37)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605672))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408488))] = (compute_local[(53)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408488))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 48))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 48))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802864))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802864))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605680))] = (compute_local[(38)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + 
((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605680))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408496))] = (compute_local[(54)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408496))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 56))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 56))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802872))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802872))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605688))] = (compute_local[(39)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605688))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408504))] = (compute_local[(55)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408504))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 
128)) + (((int)threadIdx.x) & 7)) + 64))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802880))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802880))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605696))] = (compute_local[(40)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605696))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408512))] = (compute_local[(56)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408512))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 72))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 72))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802888))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + 
(((int)threadIdx.x) & 7)) + 802888))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605704))] = (compute_local[(41)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605704))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408520))] = (compute_local[(57)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408520))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 80))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 80))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802896))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802896))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605712))] = (compute_local[(42)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605712))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 
2408528))] = (compute_local[(58)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408528))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 88))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 88))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802904))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802904))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605720))] = (compute_local[(43)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605720))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408536))] = (compute_local[(59)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408536))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 96))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 96))]);\n 
compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802912))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802912))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605728))] = (compute_local[(44)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605728))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408544))] = (compute_local[(60)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408544))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 104))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 104))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802920))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802920))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605736))] = (compute_local[(45)] + 
bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605736))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408552))] = (compute_local[(61)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408552))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 112))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 112))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802928))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802928))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605744))] = (compute_local[(46)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605744))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408560))] = (compute_local[(62)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408560))]);\n 
compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 120))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 120))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802936))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 802936))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605752))] = (compute_local[(47)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 1605752))]);\n compute[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408568))] = (compute_local[(63)] + bias[(((((((((int)blockIdx.x) / 196) * 3211264) + ((((int)threadIdx.x) >> 3) * 25088)) + ((((int)blockIdx.x) % 196) * 128)) + (((int)threadIdx.x) & 7)) + 2408568))]);\n}\n", "gridDim": [1568, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[128,512,1,1]_[128,128,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[128,512,1,1]_[128,128,28,28].json new file mode 100644 index 000000000..0b991cc7a --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[128,512,1,1]_[128,128,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 28, 28], 
"filter_shape": [128, 512, 1, 1], "output_shape": [128, 128, 28, 28], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_512_28_28___128_512_1_1___128_128_28_28_", "code": "extern \"C\" __global__ void roller_Convolution__128_512_28_28___128_512_1_1___128_128_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 
0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 16; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 1568))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 
128) + (((int)threadIdx.x) & 127)) % 784)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 4704))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 7840))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 9408))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 10976))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 14112))];\n 
compute_shared[((((int)threadIdx.x) + 2560))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 15680))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 17248))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 18816))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 20384))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 21952))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[(((((((((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 7) * 784)) + (((((int)blockIdx.x) * 128) + (((int)threadIdx.x) & 127)) % 784)) + 23520))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = 
kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n 
compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[((((((((int)threadIdx.x) >> 5) * 512) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n 
compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * 
compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + 
(compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n 
compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605632))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211264))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816896))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816896))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028160))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633792))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633792))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239424))] = max((compute_local[(56)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239424))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211280))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211280))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816912))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816912))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028176))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028176))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633808))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633808))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239440))] = max((compute_local[(57)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239440))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605664))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211296))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816928))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816928))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028192))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633824))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633824))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239456))] = max((compute_local[(58)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239456))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211312))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211312))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816944))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816944))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422576))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028208))] = max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028208))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633840))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633840))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239472))] = max((compute_local[(59)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239472))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605696))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211328))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211328))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816960))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816960))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))] = max((compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028224))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633856))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633856))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239488))] = max((compute_local[(60)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239488))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 80))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605712))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211344))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211344))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816976))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816976))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))] = max((compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422608))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028240))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028240))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633872))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633872))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239504))] = max((compute_local[(61)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239504))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605728))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211360))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211360))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816992))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4816992))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422624))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028256))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028256))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633888))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633888))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239520))] = max((compute_local[(62)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239520))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 112))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 1605744))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211376))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 3211376))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4817008))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 4817008))]), 
0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 6422640))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028272))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 8028272))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633904))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 9633904))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239536))] = max((compute_local[(63)] + bias[((((((((int)threadIdx.x) >> 4) * 100352) + (((int)blockIdx.x) * 128)) + (((int)threadIdx.x) & 15)) + 11239536))]), 0.000000e+00f);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[256,512,1,1]_[128,256,28,28].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[256,512,1,1]_[128,256,28,28].json new file mode 100644 index 000000000..ce98cdc7f --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,28,28]_[256,512,1,1]_[128,256,28,28].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 28, 28], "filter_shape": [256, 512, 1, 1], "output_shape": [128, 256, 28, 28], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": 
"roller_Convolution__128_512_28_28___256_512_1_1___128_256_28_28_", "code": "extern \"C\" __global__ void roller_Convolution__128_512_28_28___256_512_1_1___128_256_28_28_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 16; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + 
(((int)threadIdx.x) & 63)) % 784)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784)) + 3136))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784)) + 9408))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784)) + 15680))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) % 784)) + 18816))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 1568) * 64) + (((int)threadIdx.x) & 63)) / 784) * 401408) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 6) * 784)) + ((((((int)blockIdx.x) % 1568) * 64) 
+ (((int)threadIdx.x) & 63)) % 784)) + 21952))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n 
compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 1568) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + 
k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = 
(compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * 
compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605632))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211264))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211264))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816896))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816896))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422528))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422528))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 
1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028160))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028160))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633792))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633792))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239424))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605648))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + 
((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211280))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211280))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816912))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816912))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422544))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422544))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028176))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028176))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633808))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633808))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + 
(((int)threadIdx.x) & 15)) + 11239440))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605664))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211296))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211296))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816928))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816928))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422560))] = 
max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422560))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028192))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028192))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633824))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633824))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239456))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 
1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 1605680))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211312))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 3211312))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816944))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 4816944))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422576))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 6422576))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028208))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 8028208))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633840))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 
100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 9633840))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239472))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 1568) * 12845056) + ((((int)threadIdx.x) >> 4) * 100352)) + ((((int)blockIdx.x) % 1568) * 64)) + (((int)threadIdx.x) & 15)) + 11239472))]), 0.000000e+00f);\n}\n", "gridDim": [3136, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[2048,512,1,1]_[128,2048,7,7].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[2048,512,1,1]_[128,2048,7,7].json new file mode 100644 index 000000000..2903c8af0 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[2048,512,1,1]_[128,2048,7,7].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 7, 7], "filter_shape": [2048, 512, 1, 1], "output_shape": [128, 2048, 7, 7], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_512_7_7___2048_512_1_1___128_2048_7_7_", "code": "extern \"C\" __global__ void roller_Convolution__128_512_7_7___2048_512_1_1___128_2048_7_7_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 
0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n 
compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 16; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 98))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 196))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 294))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 392))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 490))];\n 
compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 588))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 686))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 784))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 882))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 980))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 1078))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 1176))];\n 
compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 1274))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 1372))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) / 49) * 25088) + (k_outer * 1568)) + ((((int)threadIdx.x) >> 7) * 49)) + ((((((int)blockIdx.x) % 49) * 128) + (((int)threadIdx.x) & 127)) % 49)) + 1470))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 8192))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 12288))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 20480))];\n 
compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 24576))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 28672))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 40960))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 45056))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 53248))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 57344))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 49) * 65536) + ((((int)threadIdx.x) >> 5) * 512)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 61440))];\n __syncthreads();\n for (int k_inner_outer = 0; 
k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 15)) + 112))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n 
compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * 
compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100352))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100352))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200704))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200704))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301056))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + 
((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301056))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401408))] = (compute_local[(32)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401408))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501760))] = (compute_local[(40)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501760))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602112))] = (compute_local[(48)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602112))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702464))] = (compute_local[(56)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702464))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 16))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 16))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 
100368))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100368))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200720))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200720))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301072))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301072))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401424))] = (compute_local[(33)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401424))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501776))] = (compute_local[(41)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501776))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602128))] = (compute_local[(49)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602128))]);\n 
compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702480))] = (compute_local[(57)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702480))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 32))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100384))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100384))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200736))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200736))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301088))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301088))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401440))] = (compute_local[(34)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + 
((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401440))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501792))] = (compute_local[(42)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501792))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602144))] = (compute_local[(50)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602144))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702496))] = (compute_local[(58)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702496))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 48))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 48))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100400))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100400))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 
128)) + (((int)threadIdx.x) & 15)) + 200752))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200752))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301104))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301104))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401456))] = (compute_local[(35)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401456))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501808))] = (compute_local[(43)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501808))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602160))] = (compute_local[(51)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602160))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702512))] = (compute_local[(59)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 
702512))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 64))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100416))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100416))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200768))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200768))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301120))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301120))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401472))] = (compute_local[(36)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401472))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501824))] = (compute_local[(44)] + bias[(((((((((int)blockIdx.x) / 49) 
* 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501824))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602176))] = (compute_local[(52)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602176))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702528))] = (compute_local[(60)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702528))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 80))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 80))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100432))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100432))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200784))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200784))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) 
% 49) * 128)) + (((int)threadIdx.x) & 15)) + 301136))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301136))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401488))] = (compute_local[(37)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401488))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501840))] = (compute_local[(45)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501840))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602192))] = (compute_local[(53)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602192))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702544))] = (compute_local[(61)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702544))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 96))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 
96))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100448))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100448))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200800))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200800))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301152))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301152))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401504))] = (compute_local[(38)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401504))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501856))] = (compute_local[(46)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501856))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602208))] = (compute_local[(54)] + bias[(((((((((int)blockIdx.x) 
/ 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602208))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702560))] = (compute_local[(62)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702560))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 112))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 112))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100464))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 100464))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200816))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 200816))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301168))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 301168))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + 
((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401520))] = (compute_local[(39)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 401520))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501872))] = (compute_local[(47)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 501872))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602224))] = (compute_local[(55)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 602224))]);\n compute[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702576))] = (compute_local[(63)] + bias[(((((((((int)blockIdx.x) / 49) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 49) * 128)) + (((int)threadIdx.x) & 15)) + 702576))]);\n}\n", "gridDim": [784, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[512,512,3,3]_[128,512,7,7].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[512,512,3,3]_[128,512,7,7].json new file mode 100644 index 000000000..655713c66 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,512,7,7]_[512,512,3,3]_[128,512,7,7].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 512, 7, 7], "filter_shape": [512, 512, 3, 3], "output_shape": [128, 512, 7, 7], "window_movement_strides": [1, 1], "padding_below_diff": 
[1, 1], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_512_7_7___512_512_3_3___128_512_7_7_", "code": "extern \"C\" __global__ void roller_Convolution__128_512_7_7___512_512_3_3___128_512_7_7_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 144; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + 
((((k_outer * 32) + (((int)threadIdx.x) >> 6)) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 4) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 8) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 12) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 3) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + 
(((int)threadIdx.x) >> 6)) + 16) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 7) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 20) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 24) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 6) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + (((k_outer * 32) + (((int)threadIdx.x) >> 6)) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[(((((((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) / 49) * 41472) + (((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 28) / 9) * 81)) + ((((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 49) / 7) * 9)) + ((((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 9) / 3) * 9)) + ((((((int)blockIdx.x) % 98) * 64) + (((int)threadIdx.x) & 63)) % 7)) + ((((k_outer * 32) + (((int)threadIdx.x) >> 6)) + 1) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 
31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 36864))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 73728))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 110592))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 147456))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 184320))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 221184))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 258048))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 294912))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 331776))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 368640))];\n 
compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 405504))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 442368))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 479232))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 516096))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 98) * 589824) + ((((int)threadIdx.x) >> 5) * 4608)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 552960))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 64) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 64) + (((int)threadIdx.x) & 15)) + 48))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(4)] = 
compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 32) + k_inner_outer) + 3584))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * 
compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 
15)))] = max((compute_local[(0)] + bias[((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100352))] = max((compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100352))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200704))] = max((compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200704))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301056))] = max((compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301056))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401408))] = max((compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401408))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501760))] = max((compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 
64)) + (((int)threadIdx.x) & 15)) + 501760))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602112))] = max((compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602112))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702464))] = max((compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702464))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100368))] = max((compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100368))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200720))] = max((compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200720))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 
6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301072))] = max((compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301072))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401424))] = max((compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401424))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501776))] = max((compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501776))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602128))] = max((compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602128))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702480))] = max((compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702480))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 
98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100384))] = max((compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100384))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200736))] = max((compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200736))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301088))] = max((compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301088))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401440))] = max((compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401440))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501792))] = max((compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501792))]), 
0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602144))] = max((compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602144))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702496))] = max((compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702496))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100400))] = max((compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 100400))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200752))] = max((compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 200752))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + 
(((int)threadIdx.x) & 15)) + 301104))] = max((compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 301104))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401456))] = max((compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 401456))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501808))] = max((compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 501808))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602160))] = max((compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 602160))]), 0.000000e+00f);\n compute[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702512))] = max((compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 98) * 802816) + ((((int)threadIdx.x) >> 4) * 6272)) + ((((int)blockIdx.x) % 98) * 64)) + (((int)threadIdx.x) & 15)) + 702512))]), 0.000000e+00f);\n}\n", "gridDim": [392, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[256,64,1,1]_[128,256,56,56].json 
b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[256,64,1,1]_[128,256,56,56].json new file mode 100644 index 000000000..6a2fcd4ce --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[256,64,1,1]_[128,256,56,56].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 64, 56, 56], "filter_shape": [256, 64, 1, 1], "output_shape": [128, 256, 56, 56], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add", "tvm_func_name": "roller_Convolution__128_64_56_56___256_64_1_1___128_256_56_56_", "code": "extern \"C\" __global__ void roller_Convolution__128_64_56_56___256_64_1_1___128_256_56_56_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[4096];\n __shared__ float compute_d_shared[4096];\n float compute_shared_local[4];\n float compute_d_shared_local[16];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(33)] = 
0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 2; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[(((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 
127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 18816))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 25088))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 31360))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 37632))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 43904))];\n compute_shared[((((int)threadIdx.x) + 2048))] = 
data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 50176))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 56448))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 62720))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 68992))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 75264))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 81536))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 
127)) % 3136)) + 87808))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) / 3136) * 200704) + (k_outer * 100352)) + ((((int)threadIdx.x) >> 7) * 3136)) + ((((((int)blockIdx.x) % 3136) * 128) + (((int)threadIdx.x) & 127)) % 3136)) + 94080))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 512))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1024))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 1536))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2048))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 2560))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3072))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 3584))];\n compute_d_shared[((((int)threadIdx.x) + 2048))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer 
* 32)) + (((int)threadIdx.x) & 31)) + 4096))];\n compute_d_shared[((((int)threadIdx.x) + 2304))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4608))];\n compute_d_shared[((((int)threadIdx.x) + 2560))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5120))];\n compute_d_shared[((((int)threadIdx.x) + 2816))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 5632))];\n compute_d_shared[((((int)threadIdx.x) + 3072))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6144))];\n compute_d_shared[((((int)threadIdx.x) + 3328))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 6656))];\n compute_d_shared[((((int)threadIdx.x) + 3584))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 7168))];\n compute_d_shared[((((int)threadIdx.x) + 3840))] = kernel[(((((((((int)blockIdx.x) / 3136) * 8192) + ((((int)threadIdx.x) >> 5) * 64)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 7680))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 128) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 128) + (((int)threadIdx.x) & 31)) + 96))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 
32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_d_shared_local[(8)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2048))];\n compute_d_shared_local[(9)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2304))];\n compute_d_shared_local[(10)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2560))];\n compute_d_shared_local[(11)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 2816))];\n compute_d_shared_local[(12)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3072))];\n compute_d_shared_local[(13)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3328))];\n compute_d_shared_local[(14)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3584))];\n compute_d_shared_local[(15)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 3840))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + 
(compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(8)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(0)] * compute_d_shared_local[(9)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(10)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(0)] * compute_d_shared_local[(11)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(12)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(0)] * compute_d_shared_local[(13)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(14)]));\n compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(0)] * compute_d_shared_local[(15)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n 
compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(8)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(1)] * compute_d_shared_local[(9)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(10)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(1)] * compute_d_shared_local[(11)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(12)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(1)] * compute_d_shared_local[(13)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(14)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(1)] * compute_d_shared_local[(15)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(2)] 
* compute_d_shared_local[(7)]));\n compute_local[(34)] = (compute_local[(34)] + (compute_shared_local[(2)] * compute_d_shared_local[(8)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(2)] * compute_d_shared_local[(9)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(10)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(2)] * compute_d_shared_local[(11)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(12)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(2)] * compute_d_shared_local[(13)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(14)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(2)] * compute_d_shared_local[(15)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(8)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(3)] * compute_d_shared_local[(9)]));\n compute_local[(43)] = 
(compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(10)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(3)] * compute_d_shared_local[(11)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(12)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(3)] * compute_d_shared_local[(13)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(14)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(3)] * compute_d_shared_local[(15)]));\n }\n }\n compute[((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)))] = (compute_local[(0)] + bias[((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211264))] = (compute_local[(4)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211264))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422528))] = (compute_local[(8)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422528))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 9633792))] = (compute_local[(12)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) 
* 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 9633792))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845056))] = (compute_local[(16)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845056))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056320))] = (compute_local[(20)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056320))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267584))] = (compute_local[(24)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267584))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478848))] = (compute_local[(28)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478848))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690112))] = (compute_local[(32)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690112))]);\n compute[(((((((((int)blockIdx.x) / 
3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901376))] = (compute_local[(36)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901376))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112640))] = (compute_local[(40)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112640))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35323904))] = (compute_local[(44)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35323904))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535168))] = (compute_local[(48)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535168))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746432))] = (compute_local[(52)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746432))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 
44957696))] = (compute_local[(56)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957696))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48168960))] = (compute_local[(60)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48168960))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32))] = (compute_local[(1)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211296))] = (compute_local[(5)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211296))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422560))] = (compute_local[(9)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422560))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 9633824))] = (compute_local[(13)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) 
% 3136) * 128)) + (((int)threadIdx.x) & 31)) + 9633824))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845088))] = (compute_local[(17)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845088))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056352))] = (compute_local[(21)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056352))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267616))] = (compute_local[(25)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267616))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478880))] = (compute_local[(29)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478880))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690144))] = (compute_local[(33)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690144))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + 
((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901408))] = (compute_local[(37)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901408))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112672))] = (compute_local[(41)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112672))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35323936))] = (compute_local[(45)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35323936))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535200))] = (compute_local[(49)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535200))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746464))] = (compute_local[(53)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746464))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957728))] = 
(compute_local[(57)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957728))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48168992))] = (compute_local[(61)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48168992))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 64))] = (compute_local[(2)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 64))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211328))] = (compute_local[(6)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211328))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422592))] = (compute_local[(10)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422592))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 9633856))] = (compute_local[(14)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 
128)) + (((int)threadIdx.x) & 31)) + 9633856))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845120))] = (compute_local[(18)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845120))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056384))] = (compute_local[(22)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056384))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267648))] = (compute_local[(26)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267648))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478912))] = (compute_local[(30)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478912))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690176))] = (compute_local[(34)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690176))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) 
>> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901440))] = (compute_local[(38)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901440))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112704))] = (compute_local[(42)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112704))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35323968))] = (compute_local[(46)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35323968))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535232))] = (compute_local[(50)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535232))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746496))] = (compute_local[(54)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746496))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957760))] = (compute_local[(58)] + 
bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957760))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48169024))] = (compute_local[(62)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48169024))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 96))] = (compute_local[(3)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 96))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211360))] = (compute_local[(7)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 3211360))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422624))] = (compute_local[(11)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 6422624))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 9633888))] = (compute_local[(15)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + 
(((int)threadIdx.x) & 31)) + 9633888))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845152))] = (compute_local[(19)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 12845152))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056416))] = (compute_local[(23)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 16056416))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267680))] = (compute_local[(27)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 19267680))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478944))] = (compute_local[(31)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 22478944))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690208))] = (compute_local[(35)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 25690208))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 
401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901472))] = (compute_local[(39)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 28901472))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112736))] = (compute_local[(43)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 32112736))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35324000))] = (compute_local[(47)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 35324000))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535264))] = (compute_local[(51)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 38535264))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746528))] = (compute_local[(55)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 41746528))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957792))] = (compute_local[(59)] + 
bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 44957792))]);\n compute[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48169056))] = (compute_local[(63)] + bias[(((((((((int)blockIdx.x) / 3136) * 51380224) + ((((int)threadIdx.x) >> 5) * 401408)) + ((((int)blockIdx.x) % 3136) * 128)) + (((int)threadIdx.x) & 31)) + 48169056))]);\n}\n", "gridDim": [6272, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,1,1]_[128,64,56,56].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,1,1]_[128,64,56,56].json new file mode 100644 index 000000000..d08ee6422 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,1,1]_[128,64,56,56].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 64, 56, 56], "filter_shape": [64, 64, 1, 1], "output_shape": [128, 64, 56, 56], "window_movement_strides": [1, 1], "padding_below_diff": [0, 0], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_64_56_56___64_64_1_1___128_64_56_56_", "code": "extern \"C\" __global__ void roller_Convolution__128_64_56_56___64_64_1_1___128_64_56_56_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[32];\n __shared__ float compute_shared[2048];\n __shared__ float compute_d_shared[512];\n float compute_shared_local[16];\n float compute_d_shared_local[2];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(18)] = 
0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(14)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 8; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 3136) * 200704) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 8) * 3136)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 3136)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 3136) * 200704) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 8) * 3136)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 3136)) + 6272))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[(((((((((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) / 3136) * 200704) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 8) * 3136)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 3136)) + 12544))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[(((((((((((int)blockIdx.x) * 256) + 
(((int)threadIdx.x) & 255)) / 3136) * 200704) + (k_outer * 25088)) + ((((int)threadIdx.x) >> 8) * 3136)) + (((((int)blockIdx.x) * 256) + (((int)threadIdx.x) & 255)) % 3136)) + 18816))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 3) * 64) + (k_outer * 8)) + (((int)threadIdx.x) & 7)))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 8; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 15)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 16))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 32))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 48))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 64))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 80))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 96))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 112))];\n compute_shared_local[(8)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 128))];\n compute_shared_local[(9)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 144))];\n compute_shared_local[(10)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 160))];\n compute_shared_local[(11)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 176))];\n compute_shared_local[(12)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 192))];\n compute_shared_local[(13)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 208))];\n compute_shared_local[(14)] = compute_shared[((((k_inner_outer * 256) + 
(((int)threadIdx.x) & 15)) + 224))];\n compute_shared_local[(15)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 15)) + 240))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 4) * 8) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 4) * 8) + k_inner_outer) + 256))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(1)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(23)] = (compute_local[(23)] + 
(compute_shared_local[(7)] * compute_d_shared_local[(1)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(8)] * compute_d_shared_local[(0)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(8)] * compute_d_shared_local[(1)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(9)] * compute_d_shared_local[(0)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(9)] * compute_d_shared_local[(1)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(10)] * compute_d_shared_local[(0)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(10)] * compute_d_shared_local[(1)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(11)] * compute_d_shared_local[(0)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(11)] * compute_d_shared_local[(1)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(12)] * compute_d_shared_local[(0)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(12)] * compute_d_shared_local[(1)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(13)] * compute_d_shared_local[(0)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(13)] * compute_d_shared_local[(1)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(14)] * compute_d_shared_local[(0)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(14)] * compute_d_shared_local[(1)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(15)] * compute_d_shared_local[(0)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(15)] * compute_d_shared_local[(1)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) 
+ (((int)threadIdx.x) & 15)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845056))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 16))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 16))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845072))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845072))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 32))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845088))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845088))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 48))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 48))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845104))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 
12845104))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 64))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845120))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845120))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 80))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 80))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845136))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845136))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 96))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845152))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845152))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 112))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 112))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845168))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845168))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 128))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 128))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845184))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845184))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 144))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 144))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845200))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845200))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 160))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845216))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845216))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 176))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 176))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845232))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845232))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 192))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845248))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845248))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 208))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 208))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845264))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 224))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 224))]), 0.000000e+00f);\n 
compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845280))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 240))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 240))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845296))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 4) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 15)) + 12845296))]), 0.000000e+00f);\n}\n", "gridDim": [1568, 1, 1], "blockDim": [512, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,3,3]_[128,64,56,56].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,3,3]_[128,64,56,56].json new file mode 100644 index 000000000..721c81162 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Convolution_[128,64,56,56]_[64,64,3,3]_[128,64,56,56].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 64, 56, 56], "filter_shape": [64, 64, 3, 3], "output_shape": [128, 64, 56, 56], "window_movement_strides": [1, 1], "padding_below_diff": [1, 1], "window_dilation_strides": [1, 1]}, "op_type": "Fused_Convolution_Add_Relu", "tvm_func_name": "roller_Convolution__128_64_56_56___64_64_3_3___128_64_56_56_", "code": "extern \"C\" __global__ void roller_Convolution__128_64_56_56___64_64_3_3___128_64_56_56_(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {\n float compute_local[64];\n __shared__ float compute_shared[8192];\n __shared__ float 
compute_d_shared[2048];\n float compute_shared_local[8];\n float compute_d_shared_local[8];\n compute_local[(0)] = 0.000000e+00f;\n compute_local[(8)] = 0.000000e+00f;\n compute_local[(16)] = 0.000000e+00f;\n compute_local[(24)] = 0.000000e+00f;\n compute_local[(32)] = 0.000000e+00f;\n compute_local[(40)] = 0.000000e+00f;\n compute_local[(48)] = 0.000000e+00f;\n compute_local[(56)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(9)] = 0.000000e+00f;\n compute_local[(17)] = 0.000000e+00f;\n compute_local[(25)] = 0.000000e+00f;\n compute_local[(33)] = 0.000000e+00f;\n compute_local[(41)] = 0.000000e+00f;\n compute_local[(49)] = 0.000000e+00f;\n compute_local[(57)] = 0.000000e+00f;\n compute_local[(2)] = 0.000000e+00f;\n compute_local[(10)] = 0.000000e+00f;\n compute_local[(18)] = 0.000000e+00f;\n compute_local[(26)] = 0.000000e+00f;\n compute_local[(34)] = 0.000000e+00f;\n compute_local[(42)] = 0.000000e+00f;\n compute_local[(50)] = 0.000000e+00f;\n compute_local[(58)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n compute_local[(11)] = 0.000000e+00f;\n compute_local[(19)] = 0.000000e+00f;\n compute_local[(27)] = 0.000000e+00f;\n compute_local[(35)] = 0.000000e+00f;\n compute_local[(43)] = 0.000000e+00f;\n compute_local[(51)] = 0.000000e+00f;\n compute_local[(59)] = 0.000000e+00f;\n compute_local[(4)] = 0.000000e+00f;\n compute_local[(12)] = 0.000000e+00f;\n compute_local[(20)] = 0.000000e+00f;\n compute_local[(28)] = 0.000000e+00f;\n compute_local[(36)] = 0.000000e+00f;\n compute_local[(44)] = 0.000000e+00f;\n compute_local[(52)] = 0.000000e+00f;\n compute_local[(60)] = 0.000000e+00f;\n compute_local[(5)] = 0.000000e+00f;\n compute_local[(13)] = 0.000000e+00f;\n compute_local[(21)] = 0.000000e+00f;\n compute_local[(29)] = 0.000000e+00f;\n compute_local[(37)] = 0.000000e+00f;\n compute_local[(45)] = 0.000000e+00f;\n compute_local[(53)] = 0.000000e+00f;\n compute_local[(61)] = 0.000000e+00f;\n compute_local[(6)] = 0.000000e+00f;\n 
compute_local[(14)] = 0.000000e+00f;\n compute_local[(22)] = 0.000000e+00f;\n compute_local[(30)] = 0.000000e+00f;\n compute_local[(38)] = 0.000000e+00f;\n compute_local[(46)] = 0.000000e+00f;\n compute_local[(54)] = 0.000000e+00f;\n compute_local[(62)] = 0.000000e+00f;\n compute_local[(7)] = 0.000000e+00f;\n compute_local[(15)] = 0.000000e+00f;\n compute_local[(23)] = 0.000000e+00f;\n compute_local[(31)] = 0.000000e+00f;\n compute_local[(39)] = 0.000000e+00f;\n compute_local[(47)] = 0.000000e+00f;\n compute_local[(55)] = 0.000000e+00f;\n compute_local[(63)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 18; ++k_outer) {\n __syncthreads();\n compute_shared[(((int)threadIdx.x))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + (((k_outer * 32) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + ((((k_outer * 32) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 1) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 1) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 2) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 2) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 768))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 3) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 
58)) + (((((k_outer * 32) + 3) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1024))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 4) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 4) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1280))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 5) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 5) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1536))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 6) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 6) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 1792))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 7) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 7) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2048))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 8) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 8) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + 
(((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2304))] = data[(((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + (((k_outer * 32) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + ((((k_outer * 32) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)) + 3364))];\n compute_shared[((((int)threadIdx.x) + 2560))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 10) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 1) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 2816))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 11) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 2) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3072))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 12) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 3) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3328))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 13) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 4) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3584))] = data[((((((((((((int)blockIdx.x) * 
256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 14) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 5) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 3840))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 15) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 6) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 4096))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 16) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 7) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 4352))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 17) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 8) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 4608))] = data[(((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + (((k_outer * 32) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + ((((k_outer * 32) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)) + 6728))];\n compute_shared[((((int)threadIdx.x) + 4864))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 19) / 9) * 3364)) + (((((((int)blockIdx.x) * 
256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 1) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 5120))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 20) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 2) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 5376))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 21) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 3) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 5632))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 22) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 4) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 5888))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 23) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 5) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 6144))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 24) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 6) % 9) / 3) * 58)) + 
(((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 6400))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 25) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 7) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 6656))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 26) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 8) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n compute_shared[((((int)threadIdx.x) + 6912))] = data[(((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + (((k_outer * 32) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + ((((k_outer * 32) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)) + 10092))];\n compute_shared[((((int)threadIdx.x) + 7168))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 28) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 1) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_shared[((((int)threadIdx.x) + 7424))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 29) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 2) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 2) % 3)))];\n 
compute_shared[((((int)threadIdx.x) + 7680))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 30) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 3) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + ((k_outer * 32) % 3)))];\n compute_shared[((((int)threadIdx.x) + 7936))] = data[((((((((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) / 3136) * 215296) + ((((k_outer * 32) + 31) / 9) * 3364)) + (((((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 3136) / 56) * 58)) + (((((k_outer * 32) + 4) % 9) / 3) * 58)) + (((((int)blockIdx.x) * 256) + ((int)threadIdx.x)) % 56)) + (((k_outer * 32) + 1) % 3)))];\n compute_d_shared[(((int)threadIdx.x))] = kernel[(((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n compute_d_shared[((((int)threadIdx.x) + 256))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 4608))];\n compute_d_shared[((((int)threadIdx.x) + 512))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 9216))];\n compute_d_shared[((((int)threadIdx.x) + 768))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 13824))];\n compute_d_shared[((((int)threadIdx.x) + 1024))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 18432))];\n compute_d_shared[((((int)threadIdx.x) + 1280))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 23040))];\n compute_d_shared[((((int)threadIdx.x) + 1536))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 27648))];\n compute_d_shared[((((int)threadIdx.x) + 1792))] = kernel[((((((((int)threadIdx.x) >> 5) * 576) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32256))];\n __syncthreads();\n 
for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n compute_shared_local[(0)] = compute_shared[(((k_inner_outer * 256) + (((int)threadIdx.x) & 31)))];\n compute_shared_local[(1)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 32))];\n compute_shared_local[(2)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 64))];\n compute_shared_local[(3)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 96))];\n compute_shared_local[(4)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 128))];\n compute_shared_local[(5)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 160))];\n compute_shared_local[(6)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 192))];\n compute_shared_local[(7)] = compute_shared[((((k_inner_outer * 256) + (((int)threadIdx.x) & 31)) + 224))];\n compute_d_shared_local[(0)] = compute_d_shared[((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer))];\n compute_d_shared_local[(1)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 256))];\n compute_d_shared_local[(2)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 512))];\n compute_d_shared_local[(3)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 768))];\n compute_d_shared_local[(4)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1024))];\n compute_d_shared_local[(5)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1280))];\n compute_d_shared_local[(6)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1536))];\n compute_d_shared_local[(7)] = compute_d_shared[(((((((int)threadIdx.x) >> 5) * 32) + k_inner_outer) + 1792))];\n compute_local[(0)] = (compute_local[(0)] + (compute_shared_local[(0)] * compute_d_shared_local[(0)]));\n compute_local[(8)] = (compute_local[(8)] + (compute_shared_local[(0)] * 
compute_d_shared_local[(1)]));\n compute_local[(16)] = (compute_local[(16)] + (compute_shared_local[(0)] * compute_d_shared_local[(2)]));\n compute_local[(24)] = (compute_local[(24)] + (compute_shared_local[(0)] * compute_d_shared_local[(3)]));\n compute_local[(32)] = (compute_local[(32)] + (compute_shared_local[(0)] * compute_d_shared_local[(4)]));\n compute_local[(40)] = (compute_local[(40)] + (compute_shared_local[(0)] * compute_d_shared_local[(5)]));\n compute_local[(48)] = (compute_local[(48)] + (compute_shared_local[(0)] * compute_d_shared_local[(6)]));\n compute_local[(56)] = (compute_local[(56)] + (compute_shared_local[(0)] * compute_d_shared_local[(7)]));\n compute_local[(1)] = (compute_local[(1)] + (compute_shared_local[(1)] * compute_d_shared_local[(0)]));\n compute_local[(9)] = (compute_local[(9)] + (compute_shared_local[(1)] * compute_d_shared_local[(1)]));\n compute_local[(17)] = (compute_local[(17)] + (compute_shared_local[(1)] * compute_d_shared_local[(2)]));\n compute_local[(25)] = (compute_local[(25)] + (compute_shared_local[(1)] * compute_d_shared_local[(3)]));\n compute_local[(33)] = (compute_local[(33)] + (compute_shared_local[(1)] * compute_d_shared_local[(4)]));\n compute_local[(41)] = (compute_local[(41)] + (compute_shared_local[(1)] * compute_d_shared_local[(5)]));\n compute_local[(49)] = (compute_local[(49)] + (compute_shared_local[(1)] * compute_d_shared_local[(6)]));\n compute_local[(57)] = (compute_local[(57)] + (compute_shared_local[(1)] * compute_d_shared_local[(7)]));\n compute_local[(2)] = (compute_local[(2)] + (compute_shared_local[(2)] * compute_d_shared_local[(0)]));\n compute_local[(10)] = (compute_local[(10)] + (compute_shared_local[(2)] * compute_d_shared_local[(1)]));\n compute_local[(18)] = (compute_local[(18)] + (compute_shared_local[(2)] * compute_d_shared_local[(2)]));\n compute_local[(26)] = (compute_local[(26)] + (compute_shared_local[(2)] * compute_d_shared_local[(3)]));\n compute_local[(34)] = (compute_local[(34)] + 
(compute_shared_local[(2)] * compute_d_shared_local[(4)]));\n compute_local[(42)] = (compute_local[(42)] + (compute_shared_local[(2)] * compute_d_shared_local[(5)]));\n compute_local[(50)] = (compute_local[(50)] + (compute_shared_local[(2)] * compute_d_shared_local[(6)]));\n compute_local[(58)] = (compute_local[(58)] + (compute_shared_local[(2)] * compute_d_shared_local[(7)]));\n compute_local[(3)] = (compute_local[(3)] + (compute_shared_local[(3)] * compute_d_shared_local[(0)]));\n compute_local[(11)] = (compute_local[(11)] + (compute_shared_local[(3)] * compute_d_shared_local[(1)]));\n compute_local[(19)] = (compute_local[(19)] + (compute_shared_local[(3)] * compute_d_shared_local[(2)]));\n compute_local[(27)] = (compute_local[(27)] + (compute_shared_local[(3)] * compute_d_shared_local[(3)]));\n compute_local[(35)] = (compute_local[(35)] + (compute_shared_local[(3)] * compute_d_shared_local[(4)]));\n compute_local[(43)] = (compute_local[(43)] + (compute_shared_local[(3)] * compute_d_shared_local[(5)]));\n compute_local[(51)] = (compute_local[(51)] + (compute_shared_local[(3)] * compute_d_shared_local[(6)]));\n compute_local[(59)] = (compute_local[(59)] + (compute_shared_local[(3)] * compute_d_shared_local[(7)]));\n compute_local[(4)] = (compute_local[(4)] + (compute_shared_local[(4)] * compute_d_shared_local[(0)]));\n compute_local[(12)] = (compute_local[(12)] + (compute_shared_local[(4)] * compute_d_shared_local[(1)]));\n compute_local[(20)] = (compute_local[(20)] + (compute_shared_local[(4)] * compute_d_shared_local[(2)]));\n compute_local[(28)] = (compute_local[(28)] + (compute_shared_local[(4)] * compute_d_shared_local[(3)]));\n compute_local[(36)] = (compute_local[(36)] + (compute_shared_local[(4)] * compute_d_shared_local[(4)]));\n compute_local[(44)] = (compute_local[(44)] + (compute_shared_local[(4)] * compute_d_shared_local[(5)]));\n compute_local[(52)] = (compute_local[(52)] + (compute_shared_local[(4)] * compute_d_shared_local[(6)]));\n 
compute_local[(60)] = (compute_local[(60)] + (compute_shared_local[(4)] * compute_d_shared_local[(7)]));\n compute_local[(5)] = (compute_local[(5)] + (compute_shared_local[(5)] * compute_d_shared_local[(0)]));\n compute_local[(13)] = (compute_local[(13)] + (compute_shared_local[(5)] * compute_d_shared_local[(1)]));\n compute_local[(21)] = (compute_local[(21)] + (compute_shared_local[(5)] * compute_d_shared_local[(2)]));\n compute_local[(29)] = (compute_local[(29)] + (compute_shared_local[(5)] * compute_d_shared_local[(3)]));\n compute_local[(37)] = (compute_local[(37)] + (compute_shared_local[(5)] * compute_d_shared_local[(4)]));\n compute_local[(45)] = (compute_local[(45)] + (compute_shared_local[(5)] * compute_d_shared_local[(5)]));\n compute_local[(53)] = (compute_local[(53)] + (compute_shared_local[(5)] * compute_d_shared_local[(6)]));\n compute_local[(61)] = (compute_local[(61)] + (compute_shared_local[(5)] * compute_d_shared_local[(7)]));\n compute_local[(6)] = (compute_local[(6)] + (compute_shared_local[(6)] * compute_d_shared_local[(0)]));\n compute_local[(14)] = (compute_local[(14)] + (compute_shared_local[(6)] * compute_d_shared_local[(1)]));\n compute_local[(22)] = (compute_local[(22)] + (compute_shared_local[(6)] * compute_d_shared_local[(2)]));\n compute_local[(30)] = (compute_local[(30)] + (compute_shared_local[(6)] * compute_d_shared_local[(3)]));\n compute_local[(38)] = (compute_local[(38)] + (compute_shared_local[(6)] * compute_d_shared_local[(4)]));\n compute_local[(46)] = (compute_local[(46)] + (compute_shared_local[(6)] * compute_d_shared_local[(5)]));\n compute_local[(54)] = (compute_local[(54)] + (compute_shared_local[(6)] * compute_d_shared_local[(6)]));\n compute_local[(62)] = (compute_local[(62)] + (compute_shared_local[(6)] * compute_d_shared_local[(7)]));\n compute_local[(7)] = (compute_local[(7)] + (compute_shared_local[(7)] * compute_d_shared_local[(0)]));\n compute_local[(15)] = (compute_local[(15)] + (compute_shared_local[(7)] * 
compute_d_shared_local[(1)]));\n compute_local[(23)] = (compute_local[(23)] + (compute_shared_local[(7)] * compute_d_shared_local[(2)]));\n compute_local[(31)] = (compute_local[(31)] + (compute_shared_local[(7)] * compute_d_shared_local[(3)]));\n compute_local[(39)] = (compute_local[(39)] + (compute_shared_local[(7)] * compute_d_shared_local[(4)]));\n compute_local[(47)] = (compute_local[(47)] + (compute_shared_local[(7)] * compute_d_shared_local[(5)]));\n compute_local[(55)] = (compute_local[(55)] + (compute_shared_local[(7)] * compute_d_shared_local[(6)]));\n compute_local[(63)] = (compute_local[(63)] + (compute_shared_local[(7)] * compute_d_shared_local[(7)]));\n }\n }\n compute[(((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)))] = max((compute_local[(0)] + bias[(((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211264))] = max((compute_local[(8)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211264))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422528))] = max((compute_local[(16)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422528))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633792))] = max((compute_local[(24)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633792))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845056))] = max((compute_local[(32)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + 
(((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845056))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056320))] = max((compute_local[(40)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056320))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267584))] = max((compute_local[(48)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267584))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478848))] = max((compute_local[(56)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478848))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 32))] = max((compute_local[(1)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 32))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211296))] = max((compute_local[(9)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211296))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422560))] = max((compute_local[(17)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422560))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633824))] = max((compute_local[(25)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + 
(((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633824))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845088))] = max((compute_local[(33)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845088))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056352))] = max((compute_local[(41)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056352))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267616))] = max((compute_local[(49)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267616))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478880))] = max((compute_local[(57)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478880))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64))] = max((compute_local[(2)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 64))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211328))] = max((compute_local[(10)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211328))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422592))] = max((compute_local[(18)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + 
(((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422592))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633856))] = max((compute_local[(26)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633856))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845120))] = max((compute_local[(34)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845120))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056384))] = max((compute_local[(42)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056384))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267648))] = max((compute_local[(50)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267648))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478912))] = max((compute_local[(58)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478912))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 96))] = max((compute_local[(3)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 96))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211360))] = max((compute_local[(11)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + 
(((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211360))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422624))] = max((compute_local[(19)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422624))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633888))] = max((compute_local[(27)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633888))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845152))] = max((compute_local[(35)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845152))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056416))] = max((compute_local[(43)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056416))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267680))] = max((compute_local[(51)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267680))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478944))] = max((compute_local[(59)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478944))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 128))] = max((compute_local[(4)] + bias[((((((((int)threadIdx.x) >> 5) * 
401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 128))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211392))] = max((compute_local[(12)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211392))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422656))] = max((compute_local[(20)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422656))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633920))] = max((compute_local[(28)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633920))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845184))] = max((compute_local[(36)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845184))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056448))] = max((compute_local[(44)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056448))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267712))] = max((compute_local[(52)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267712))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478976))] = max((compute_local[(60)] + bias[((((((((int)threadIdx.x) >> 
5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22478976))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 160))] = max((compute_local[(5)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 160))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211424))] = max((compute_local[(13)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211424))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422688))] = max((compute_local[(21)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422688))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633952))] = max((compute_local[(29)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633952))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845216))] = max((compute_local[(37)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845216))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056480))] = max((compute_local[(45)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056480))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267744))] = max((compute_local[(53)] + bias[((((((((int)threadIdx.x) >> 
5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267744))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22479008))] = max((compute_local[(61)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22479008))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 192))] = max((compute_local[(6)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 192))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211456))] = max((compute_local[(14)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211456))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422720))] = max((compute_local[(22)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422720))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633984))] = max((compute_local[(30)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9633984))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845248))] = max((compute_local[(38)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845248))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056512))] = max((compute_local[(46)] + bias[((((((((int)threadIdx.x) >> 
5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056512))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267776))] = max((compute_local[(54)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267776))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22479040))] = max((compute_local[(62)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22479040))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 224))] = max((compute_local[(7)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 224))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211488))] = max((compute_local[(15)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 3211488))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422752))] = max((compute_local[(23)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 6422752))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9634016))] = max((compute_local[(31)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 9634016))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845280))] = max((compute_local[(39)] + bias[((((((((int)threadIdx.x) >> 
5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 12845280))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056544))] = max((compute_local[(47)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 16056544))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267808))] = max((compute_local[(55)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 19267808))]), 0.000000e+00f);\n compute[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22479072))] = max((compute_local[(63)] + bias[((((((((int)threadIdx.x) >> 5) * 401408) + (((int)blockIdx.x) * 256)) + (((int)threadIdx.x) & 31)) + 22479072))]), 0.000000e+00f);\n}\n", "gridDim": [1568, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Dot_[128,2048]_[2048,1000]_[128,1000].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Dot_[128,2048]_[2048,1000]_[128,1000].json new file mode 100644 index 000000000..2194b7b4b --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Dot_[128,2048]_[2048,1000]_[128,1000].json @@ -0,0 +1 @@ +{"parameters": {"arg0_shape": [128, 2048], "arg1_shape": [2048, 1000], "out_shape": [128, 1000], "transpose_A": false, "transpose_B": false}, "op_type": "Dot", "tvm_func_name": "roller_Dot__128_2048___2048_1000___128_1000_", "code": "extern \"C\" __global__ void roller_Dot__128_2048___2048_1000___128_1000_(float* __restrict__ A, float* __restrict__ B, float* __restrict__ compute) {\n float compute_local[4];\n __shared__ float A_shared[1056];\n __shared__ float B_shared[1024];\n float A_shared_local[2];\n float B_shared_local[2];\n compute_local[(0)] = 0.000000e+00f;\n 
compute_local[(2)] = 0.000000e+00f;\n compute_local[(1)] = 0.000000e+00f;\n compute_local[(3)] = 0.000000e+00f;\n for (int k_outer = 0; k_outer < 64; ++k_outer) {\n __syncthreads();\n A_shared[((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)))] = A[((((((((int)blockIdx.x) >> 5) * 65536) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 264))] = A[(((((((((int)blockIdx.x) >> 5) * 65536) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 16384))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 528))] = A[(((((((((int)blockIdx.x) >> 5) * 65536) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 32768))];\n A_shared[(((((((int)threadIdx.x) >> 5) * 33) + (((int)threadIdx.x) & 31)) + 792))] = A[(((((((((int)blockIdx.x) >> 5) * 65536) + ((((int)threadIdx.x) >> 5) * 2048)) + (k_outer * 32)) + (((int)threadIdx.x) & 31)) + 49152))];\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[(((int)threadIdx.x))] = B[(((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 256))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 8000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 512))] = B[((((((k_outer * 32000) + ((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 16000))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 31)) < 1000) {\n B_shared[((((int)threadIdx.x) + 768))] = B[((((((k_outer * 32000) + 
((((int)threadIdx.x) >> 5) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 31)) + 24000))];\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 32; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer))];\n A_shared_local[(1)] = A_shared[(((((((int)threadIdx.x) >> 4) * 33) + k_inner_outer) + 528))];\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 1000) {\n B_shared_local[(0)] = B_shared[(((k_inner_outer * 32) + (((int)threadIdx.x) & 15)))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 984) {\n B_shared_local[(1)] = B_shared[((((k_inner_outer * 32) + (((int)threadIdx.x) & 15)) + 16))];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 1000) {\n compute_local[(0)] = (compute_local[(0)] + (A_shared_local[(0)] * B_shared_local[(0)]));\n compute_local[(2)] = (compute_local[(2)] + (A_shared_local[(1)] * B_shared_local[(0)]));\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 984) {\n compute_local[(1)] = (compute_local[(1)] + (A_shared_local[(0)] * B_shared_local[(1)]));\n compute_local[(3)] = (compute_local[(3)] + (A_shared_local[(1)] * B_shared_local[(1)]));\n }\n }\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 1000) {\n compute[((((((((int)blockIdx.x) >> 5) * 32000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)))] = compute_local[(0)];\n compute[(((((((((int)blockIdx.x) >> 5) * 32000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)) + 16000))] = compute_local[(2)];\n }\n if ((((((int)blockIdx.x) & 31) * 32) + (((int)threadIdx.x) & 15)) < 984) {\n compute[(((((((((int)blockIdx.x) >> 5) * 32000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)) + 16))] = compute_local[(1)];\n 
compute[(((((((((int)blockIdx.x) >> 5) * 32000) + ((((int)threadIdx.x) >> 4) * 1000)) + ((((int)blockIdx.x) & 31) * 32)) + (((int)threadIdx.x) & 15)) + 16016))] = compute_local[(3)];\n }\n}\n", "gridDim": [128, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_MaxPool_[128,64,112,112]_[128,64,56,56].json b/src/tools/nnfusion/kernel_db/roller_res/roller_MaxPool_[128,64,112,112]_[128,64,56,56].json new file mode 100644 index 000000000..03d517cb3 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_MaxPool_[128,64,112,112]_[128,64,56,56].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 64, 112, 112], "output_shape": [128, 64, 56, 56], "window_shape": [3, 3], "window_stride": [2, 2], "padding_below": [0, 0]}, "op_type": "MaxPool", "tvm_func_name": "roller_MaxPool__128_64_112_112___128_64_56_56_", "code": "extern \"C\" __global__ void roller_MaxPool__128_64_112_112___128_64_56_56_(float* __restrict__ Pool2d, float* __restrict__ data) {\n __shared__ float compute_shared[999];\n if ((((((int)blockIdx.x) % 14) * 4) + (((int)threadIdx.x) >> 6)) < 55) {\n if ((((int)threadIdx.x) & 63) < 55) {\n Pool2d[((((((((int)blockIdx.x) / 14) * 3025) + ((((int)blockIdx.x) % 14) * 220)) + ((((int)threadIdx.x) >> 6) * 55)) + (((int)threadIdx.x) & 63)))] = -3.402823e+38f;\n }\n }\n compute_shared[(((int)threadIdx.x))] = data[((((((int)blockIdx.x) * 896) + ((((int)threadIdx.x) / 111) * 112)) + (((int)threadIdx.x) % 111)))];\n compute_shared[((((int)threadIdx.x) + 256))] = data[((((((int)blockIdx.x) * 896) + (((((int)threadIdx.x) + 256) / 111) * 112)) + ((((int)threadIdx.x) + 34) % 111)))];\n compute_shared[((((int)threadIdx.x) + 512))] = data[((((((int)blockIdx.x) * 896) + (((((int)threadIdx.x) + 512) / 111) * 112)) + ((((int)threadIdx.x) + 68) % 111)))];\n if (((int)threadIdx.x) < 231) {\n compute_shared[((((int)threadIdx.x) + 768))] = (((((((int)blockIdx.x) % 14) * 8) + ((((int)threadIdx.x) + 768) 
/ 111)) < 112) ? data[((((((int)blockIdx.x) * 896) + (((((int)threadIdx.x) + 768) / 111) * 112)) + ((((int)threadIdx.x) + 102) % 111)))] : 0.000000e+00f);\n }\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 9; ++k_inner_outer) {\n if ((((((int)blockIdx.x) % 14) * 4) + (((int)threadIdx.x) >> 6)) < 55) {\n if ((((int)threadIdx.x) & 63) < 55) {\n Pool2d[((((((((int)blockIdx.x) / 14) * 3025) + ((((int)blockIdx.x) % 14) * 220)) + ((((int)threadIdx.x) >> 6) * 55)) + (((int)threadIdx.x) & 63)))] = max(Pool2d[((((((((int)blockIdx.x) / 14) * 3025) + ((((int)blockIdx.x) % 14) * 220)) + ((((int)threadIdx.x) >> 6) * 55)) + (((int)threadIdx.x) & 63)))], compute_shared[((((((((int)threadIdx.x) >> 6) * 222) + ((k_inner_outer / 3) * 111)) + ((((int)threadIdx.x) & 63) * 2)) + (k_inner_outer % 3)))]);\n }\n }\n }\n}\n", "gridDim": [114688, 1, 1], "blockDim": [256, 1, 1]} \ No newline at end of file diff --git a/src/tools/nnfusion/kernel_db/roller_res/roller_Sum_[128,2048,7,7]_[128,2048].json b/src/tools/nnfusion/kernel_db/roller_res/roller_Sum_[128,2048,7,7]_[128,2048].json new file mode 100644 index 000000000..d95c42383 --- /dev/null +++ b/src/tools/nnfusion/kernel_db/roller_res/roller_Sum_[128,2048,7,7]_[128,2048].json @@ -0,0 +1 @@ +{"parameters": {"input_shape": [128, 2048, 7, 7], "output_shape": [128, 2048], "reduction_axis": [2, 3]}, "op_type": "Sum", "tvm_func_name": "roller_Sum__128_2048_7_7___128_2048_", "code": "extern \"C\" __global__ void roller_Sum__128_2048_7_7___128_2048_(float* __restrict__ A, float* __restrict__ compute) {\n float compute_local[1];\n __shared__ float A_shared[6272];\n float A_shared_local[1];\n compute_local[(0)] = 0.000000e+00f;\n A_shared[(((int)threadIdx.x))] = A[(((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)))];\n A_shared[((((int)threadIdx.x) + 128))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 128))];\n A_shared[((((int)threadIdx.x) + 256))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 256))];\n 
A_shared[((((int)threadIdx.x) + 384))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 384))];\n A_shared[((((int)threadIdx.x) + 512))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 512))];\n A_shared[((((int)threadIdx.x) + 640))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 640))];\n A_shared[((((int)threadIdx.x) + 768))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 768))];\n A_shared[((((int)threadIdx.x) + 896))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 896))];\n A_shared[((((int)threadIdx.x) + 1024))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1024))];\n A_shared[((((int)threadIdx.x) + 1152))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1152))];\n A_shared[((((int)threadIdx.x) + 1280))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1280))];\n A_shared[((((int)threadIdx.x) + 1408))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1408))];\n A_shared[((((int)threadIdx.x) + 1536))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1536))];\n A_shared[((((int)threadIdx.x) + 1664))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1664))];\n A_shared[((((int)threadIdx.x) + 1792))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1792))];\n A_shared[((((int)threadIdx.x) + 1920))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 1920))];\n A_shared[((((int)threadIdx.x) + 2048))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2048))];\n A_shared[((((int)threadIdx.x) + 2176))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2176))];\n A_shared[((((int)threadIdx.x) + 2304))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2304))];\n A_shared[((((int)threadIdx.x) + 2432))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2432))];\n A_shared[((((int)threadIdx.x) + 2560))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2560))];\n A_shared[((((int)threadIdx.x) + 2688))] = 
A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2688))];\n A_shared[((((int)threadIdx.x) + 2816))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2816))];\n A_shared[((((int)threadIdx.x) + 2944))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 2944))];\n A_shared[((((int)threadIdx.x) + 3072))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3072))];\n A_shared[((((int)threadIdx.x) + 3200))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3200))];\n A_shared[((((int)threadIdx.x) + 3328))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3328))];\n A_shared[((((int)threadIdx.x) + 3456))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3456))];\n A_shared[((((int)threadIdx.x) + 3584))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3584))];\n A_shared[((((int)threadIdx.x) + 3712))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3712))];\n A_shared[((((int)threadIdx.x) + 3840))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3840))];\n A_shared[((((int)threadIdx.x) + 3968))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 3968))];\n A_shared[((((int)threadIdx.x) + 4096))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4096))];\n A_shared[((((int)threadIdx.x) + 4224))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4224))];\n A_shared[((((int)threadIdx.x) + 4352))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4352))];\n A_shared[((((int)threadIdx.x) + 4480))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4480))];\n A_shared[((((int)threadIdx.x) + 4608))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4608))];\n A_shared[((((int)threadIdx.x) + 4736))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4736))];\n A_shared[((((int)threadIdx.x) + 4864))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 4864))];\n A_shared[((((int)threadIdx.x) + 4992))] = A[((((((int)blockIdx.x) * 6272) + 
((int)threadIdx.x)) + 4992))];\n A_shared[((((int)threadIdx.x) + 5120))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5120))];\n A_shared[((((int)threadIdx.x) + 5248))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5248))];\n A_shared[((((int)threadIdx.x) + 5376))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5376))];\n A_shared[((((int)threadIdx.x) + 5504))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5504))];\n A_shared[((((int)threadIdx.x) + 5632))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5632))];\n A_shared[((((int)threadIdx.x) + 5760))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5760))];\n A_shared[((((int)threadIdx.x) + 5888))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 5888))];\n A_shared[((((int)threadIdx.x) + 6016))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 6016))];\n A_shared[((((int)threadIdx.x) + 6144))] = A[((((((int)blockIdx.x) * 6272) + ((int)threadIdx.x)) + 6144))];\n __syncthreads();\n for (int k_inner_outer = 0; k_inner_outer < 49; ++k_inner_outer) {\n A_shared_local[(0)] = A_shared[(((((int)threadIdx.x) * 49) + k_inner_outer))];\n compute_local[(0)] = (compute_local[(0)] + A_shared_local[(0)]);\n }\n compute[(((((int)blockIdx.x) * 128) + ((int)threadIdx.x)))] = compute_local[(0)];\n}\n", "gridDim": [2048, 1, 1], "blockDim": [128, 1, 1]} \ No newline at end of file From 5a1cc29ae309c86b781c490779b1ec0c72e7ff65 Mon Sep 17 00:00:00 2001 From: cjkkkk Date: Fri, 15 Apr 2022 13:37:29 +0000 Subject: [PATCH 24/24] fix default_kernel1 entry --- src/tools/nnfusion/kernel_db/parse_code.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/tools/nnfusion/kernel_db/parse_code.py b/src/tools/nnfusion/kernel_db/parse_code.py index 9ff655c32..a9414ec06 100644 --- a/src/tools/nnfusion/kernel_db/parse_code.py +++ b/src/tools/nnfusion/kernel_db/parse_code.py @@ -87,11 +87,14 @@ if match: lb = match.group() line = 
line.replace(lb, "") - kernel_name = re.search("void .*_kernel0", line).group() - # print(kernel_name) - line = line.replace(kernel_name, "void " + tvm_func_name) - code += line - flag = True + try: + kernel_name = re.search("void .*_kernel0", line).group() + # print(kernel_name) + line = line.replace(kernel_name, "void " + tvm_func_name) + code += line + flag = True + except: + pass if "dim3 grid(" in line: line = line.split("(")[1].split(")")[0].split(",") for i in line: