From ea5c5a86fb2d353a90eb1300824b6529888b26d8 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 08:47:30 -0700 Subject: [PATCH 01/41] merge --- hls4ml/templates/vivado/build_prj.tcl | 2 +- .../templates/vivado/nnet_utils/nnet_common.h | 1 + .../templates/vivado/nnet_utils/nnet_conv1d.h | 16 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 221 ++++++++++++++++++ .../vivado/nnet_utils/nnet_conv_stream.h | 2 - 5 files changed, 237 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index d34337c573..6383b910ca 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 4096} +catch {config_array_partition -maximum_size 8192} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..b6582e1406 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,6 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; +enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..c2990ea97a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -53,9 +53,21 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region - // Nothing special to be done for io_parallel implementation if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise){ + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { + pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + } + else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } + } + else { + // Use standard unrolled implementation + conv_1d_resource_cl(data, res, weights, biases); + } } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8549ae9add 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,226 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS 
ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + //const int multiplier_limit = compute_multiplier_limit(weights); + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + // Convolve, saving all multiplication results to accumulate later + ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + int index_weight = cc*CONFIG_T::n_filt + ff; + int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + mult[index_mult] = 0; + } + else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + }//end channel loop + }//end filter loop + }//end output loop + + + // Initialize accumulator with input biases + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff]=biases[ff]; + } + } + + + // Accumulate multiplication result + AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + //Do "dot product" sum within filter and sum over channels + AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + }//end channel loop + }//end filter loop + }//end output loop + + + // Cast to "res_t" type + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +template void pointwise_conv_1d_latency_cl_split_by_rf( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + } + } + + pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); + pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); + if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); + if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], 
weights, biases); + if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); + if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); + if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); + if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); + if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); + if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); + if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); + if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); + if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); + if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); + if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); + if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); + if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); + if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); + if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); + if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); + if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); + if (CONFIG_T::reuse_factor > 36) 
pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
    if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
    if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
    if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
    if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
    if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
    if (CONFIG_T::reuse_factor > 42) pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, 
biases); + if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases); + if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases); + if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases); + if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases); + if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases); + if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); + if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); + if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); + if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); + if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); + if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); + if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); + if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); + if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); + if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); + if (CONFIG_T::reuse_factor > 101) 
pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); + if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 7bd47442f6..b763938cb3 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -8,8 +8,6 @@ namespace nnet { -enum class conv_implementation { linebuffer = 0, encoded = 1 }; - // ************************************************* // Encoded Implementation (Vlad's) // ************************************************* From 6849e0b4d0a1b352cac1d61870273882dc112705 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 22 Dec 2022 16:21:25 -0600 Subject: [PATCH 02/41] add pointwise --- hls4ml/backends/vivado/vivado_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1d4c96d982..4dab5f5c18 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,7 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # 
attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) self.attribute_map[layer] = attrs def _register_flows(self): From 0244b666652e2667c8df72c134f9abd94c731685 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:29:44 -0700 Subject: [PATCH 03/41] latency --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index c2990ea97a..e2dee3485a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -66,7 +66,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } else { // Use standard unrolled implementation - conv_1d_resource_cl(data, res, weights, biases); + conv_1d_latency_cl(data, res, weights, biases); } } else { conv_1d_resource_cl(data, res, weights, biases); From 3ae7752e70dc43d0687b39a90d7c4d0fd6f9b797 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:56:58 -0700 Subject: [PATCH 04/41] unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8549ae9add..4179c1dde8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -104,6 +104,7 @@ void pointwise_conv_1d_latency_cl( // Parallel mode #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization @@ -114,6 +115,7 @@ void pointwise_conv_1d_latency_cl( ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; int index_weight = cc*CONFIG_T::n_filt + ff; int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; @@ -132,6 +134,7 @@ void pointwise_conv_1d_latency_cl( // Initialize accumulator with input biases for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL acc[ii][ff]=biases[ff]; } } @@ -152,6 +155,7 @@ void pointwise_conv_1d_latency_cl( // Cast to "res_t" type for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } @@ -169,7 +173,9 @@ template void pointwise_conv_1d_la res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL data_tmp[jj][ii] = 
data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; @@ -297,7 +303,9 @@ template void pointwise_conv_1d_la if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; From 23126b70ca5496bcc7da993d95a8d939920bd8bc Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 26 Mar 2023 17:19:08 -0700 Subject: [PATCH 05/41] add hls unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 4179c1dde8..c5b520c703 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -175,6 +175,7 @@ template void pointwise_conv_1d_la RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL @@ -305,6 +306,7 @@ template void pointwise_conv_1d_la RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL From 6aff9e996df95955d010013c2163a723ab8a8170 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 8 Jun 2023 08:15:11 -0700 Subject: [PATCH 06/41] fix pragma from walkie --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c5b520c703..c423c7a228 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -108,8 +108,8 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - //const int multiplier_limit = compute_multiplier_limit(weights); - //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { From 7f1c318dea6767d5b0e4996786c356d48bfa4560 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Jun 2023 18:46:37 +0000 Subject: [PATCH 07/41] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/backends/vivado/vivado_backend.py | 4 +- .../templates/vivado/nnet_utils/nnet_common.h | 2 +- .../templates/vivado/nnet_utils/nnet_conv1d.h | 8 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 488 +++++++++++------- 4 files changed, 311 insertions(+), 191 deletions(-) 
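For intuition on the multiplier_limit expression introduced in PATCH 06/41 above: each
split instance of pointwise_conv_1d_latency_cl computes out_width / reuse_factor outputs
and is pipelined at II = reuse_factor, so the ALLOCATION pragma appears to budget roughly
(multiplications per instance) / II multiplier cores. A quick sanity check of the
arithmetic in Python, with illustrative layer dimensions that are not taken from this PR:

    import math

    # Hypothetical pointwise layer: 100 outputs, 4 filters, 3 channels, reuse_factor 10
    out_width, n_filt, n_chan, reuse_factor = 100, 4, 3, 10
    mults_per_instance = out_width / reuse_factor * n_filt * n_chan  # 10 * 4 * 3 = 120
    multiplier_limit = math.ceil(mults_per_instance / reuse_factor)  # ceil(120 / 10) = 12
    print(multiplier_limit)  # 12 multipliers shared across the pipelined loop
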
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 4dab5f5c18..1eb58f0952 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,9 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) + attrs.append( + ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') + ) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index b6582e1406..e942a1dc89 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,7 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; -enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; +enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2dee3485a..0f2e89ac8f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -54,17 +54,15 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise){ + if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); - } - else { + } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); } - } - else { + } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c423c7a228..aabc869823 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,17 +84,15 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; - 
typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 @@ -108,209 +106,331 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); - #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - - // Convolve, saving all multiplication results to accumulate later - ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; - int index_weight = cc*CONFIG_T::n_filt + ff; - int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; - } - else { + } else { mult[index_mult] = data[index_data] * weights[index_weight]; } - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL - acc[ii][ff]=biases[ff]; + acc[ii][ff] = biases[ff]; } } - - // Accumulate multiplication result - AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int 
ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Cast to "res_t" type - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } } -template void pointwise_conv_1d_latency_cl_split_by_rf( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - - RFInputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + +RFInputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerInputLoop: + for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; } } pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); - if 
(CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); - if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); - if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); - if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); - if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); - if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); - if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); - if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); - if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); - if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); - if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); - if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); - if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); - if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); - if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); - if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); - if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); - if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); - if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); - if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); - if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); - if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); - if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); - if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); - if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); - if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); - if (CONFIG_T::reuse_factor > 36) pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases); - if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases); - if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases); - if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases); - if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases); - if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases); - if (CONFIG_T::reuse_factor > 42) 
pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
-    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
-    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
-    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
-    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
-    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
-    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
-    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
-    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
-    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
-    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
-    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
-    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
-    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
-    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
-    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
-    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
-    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
-    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
-    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
-    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
-    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
-    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
-    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
-    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
-    if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
-    if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
-    if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
-    if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
-    if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases);
-    if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, 
biases); - if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); - if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); - if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); - if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); - if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); - if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); - if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); - if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); - if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); - if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); - if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); - if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); - if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); - if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); - if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); - if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); - if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); - if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); - if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); - if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); - if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); - if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); - if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); - if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); - if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); - if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); - if (CONFIG_T::reuse_factor > 101) pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); - if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); - if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); - if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); - if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); - if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); - if (CONFIG_T::reuse_factor > 107) 
pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); - if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); - if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); - if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); - if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); - if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); - if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); - if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); - if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); - if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); - if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); - if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); - if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); - - RFOutputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerOutputLoop: - for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + if (CONFIG_T::reuse_factor > 2) + pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); + if (CONFIG_T::reuse_factor > 3) + pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); + if (CONFIG_T::reuse_factor > 4) + pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) + pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) + pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) + pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) + pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) + pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) + pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) + pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) + pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) + pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) + pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) + pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) + pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) + pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) + pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) + 
pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases);
+    if (CONFIG_T::reuse_factor > 20)
+        pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases);
+    if (CONFIG_T::reuse_factor > 21)
+        pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases);
+    if (CONFIG_T::reuse_factor > 22)
+        pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases);
+    if (CONFIG_T::reuse_factor > 23)
+        pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases);
+    if (CONFIG_T::reuse_factor > 24)
+        pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases);
+    if (CONFIG_T::reuse_factor > 25)
+        pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases);
+    if (CONFIG_T::reuse_factor > 26)
+        pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases);
+    if (CONFIG_T::reuse_factor > 27)
+        pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases);
+    if (CONFIG_T::reuse_factor > 28)
+        pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases);
+    if (CONFIG_T::reuse_factor > 29)
+        pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases);
+    if (CONFIG_T::reuse_factor > 30)
+        pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases);
+    if (CONFIG_T::reuse_factor > 31)
+        pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases);
+    if (CONFIG_T::reuse_factor > 32)
+        pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases);
+    if (CONFIG_T::reuse_factor > 33)
+        pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases);
+    if (CONFIG_T::reuse_factor > 34)
+        pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases);
+    if (CONFIG_T::reuse_factor > 35)
+        pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36)
+        pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37)
+        pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38)
+        pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39)
+        pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40)
+        pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41)
+        pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42)
+        pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43)
+        pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44)
+        pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45)
+        pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46)
+        pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47)
+        pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48)
+        pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49)
+        pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50)
+        pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51)
+        pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52)
+        pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53)
+        pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54)
+        pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55)
+        pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56)
+        pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57)
+        pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58)
+        pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59)
+        pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60)
+        pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61)
+        pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62)
+        pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63)
+        pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64)
+        pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65)
+        pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66)
+        pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67)
+        pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
+        pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
+    if (CONFIG_T::reuse_factor > 69)
+        pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
+    if (CONFIG_T::reuse_factor > 70)
+        pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
+    if (CONFIG_T::reuse_factor > 71)
+        pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
+    if (CONFIG_T::reuse_factor > 72)
+        pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
+    if (CONFIG_T::reuse_factor > 73)
+        pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases);
+    if (CONFIG_T::reuse_factor > 74)
+        pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases);
+    if (CONFIG_T::reuse_factor > 75)
+        pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases);
+    if (CONFIG_T::reuse_factor > 76)
+        pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases);
+    if (CONFIG_T::reuse_factor > 77)
+        pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases);
+    if (CONFIG_T::reuse_factor > 78)
+        pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases);
+    if (CONFIG_T::reuse_factor > 79)
+        pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases);
+    if (CONFIG_T::reuse_factor > 80)
+        pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases);
+    if (CONFIG_T::reuse_factor > 81)
+        pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases);
+    if (CONFIG_T::reuse_factor > 82)
+        pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases);
+    if (CONFIG_T::reuse_factor > 83)
+        
pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) + pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) + pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) + pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) + pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) + pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) + pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) + pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) + pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) + pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) + pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) + pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) + pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) + pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) + pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) + pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) + pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) + pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); + if (CONFIG_T::reuse_factor > 101) + pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) + pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) + pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) + pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) + pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) + pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) + pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) + pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) + pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) + pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) + pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) + pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) + pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) + pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, 
biases); + if (CONFIG_T::reuse_factor > 115) + pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) + pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) + pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) + pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) + pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + +RFOutputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerOutputLoop: + for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; } } } From 69aecc6dc187a6e9a1ecdd2e7449629f1a88e87b Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:27:20 -0700 Subject: [PATCH 08/41] add test --- hls4ml/backends/vivado/vivado_backend.py | 1 - test/pytest/test_pointwiseconv.py | 37 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1eb58f0952..1a99d90a8e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -71,7 +71,6 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) attrs.append( ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') ) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 28314fe130..080106955e 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,20 +21,22 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy', + 'backend, io_type, strategy, conv_implementation', [ - ('Quartus', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'resource'), - ('Vitis', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'latency'), - ('Vitis', 'io_parallel', 'latency'), - ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource'), - ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), + ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), + ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), + ('Vitis', 'io_parallel', 'resource', 'LineBuffer'), + ('Vivado', 'io_parallel', 'latency', 'LineBuffer'), + ('Vitis', 'io_parallel', 'latency', 'LineBuffer'), + ('Vivado', 'io_parallel', 'latency', 'Pointwise'), + ('Vitis', 'io_parallel', 'latency', 'Pointwise'), + ('Vivado', 'io_stream', 'latency', 'LineBuffer'), + ('Vivado', 'io_stream', 'resource', 'LineBuffer'), + ('Vitis', 'io_stream', 'latency', 'LineBuffer'), + ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -47,6 +49,7 
@@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise1d' ) ) model.compile(optimizer='adam', loss='mse') @@ -55,14 +58,13 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): keras_prediction = model.predict(X_input) default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) + config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation output_dir = str( test_root_path - / 'hls4mlprj_pointwise1d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, strides[0], padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend @@ -100,6 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise2d' ) ) @@ -114,9 +117,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_pointwise2d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, stride_cfg, padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 4febceded10000b3b1b6b4254c9b9c230a9f475c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:48:44 -0700 Subject: [PATCH 09/41] pre-commit --- test/pytest/test_pointwiseconv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 080106955e..0cb75b7a87 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -49,7 +49,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise1d' + name='pointwise1d', ) ) model.compile(optimizer='adam', loss='mse') @@ -102,7 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise2d' + name='pointwise2d', ) ) @@ -116,8 +116,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): config['Model']['Strategy'] = strategy stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( - test_root_path - / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' + test_root_path / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 56797e73ecb1a830c28128387536308fd3f50beb Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:53:37 -0700 Subject: [PATCH 10/41] pre-commit --- test/pytest/test_pointwiseconv.py | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 0cb75b7a87..cbe2036712 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_implementation', + 'backend, io_type, strategy, conv_impl', [ ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), @@ -36,7 +36,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -60,11 +60,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy - config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From a01080dc210ef23640b766f0b9a24090ac540f58 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 7 Oct 2023 22:09:14 -0700 Subject: [PATCH 11/41] use code gen --- hls4ml/backends/fpga/fpga_backend.py | 57 +++ hls4ml/backends/fpga/passes/codegen.py | 22 ++ .../vivado/passes/convolution_templates.py | 7 + .../vivado/nnet_utils/nnet_code_gen.h | 90 +++++ .../templates/vivado/nnet_utils/nnet_conv1d.h | 4 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 351 ------------------ test/pytest/test_pointwiseconv.py | 7 +- 7 files changed, 183 insertions(+), 355 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8cfaec8b3f..349a5ddbc8 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -860,6 +860,63 @@ def generate_conv2d_line_buffer_fn( return generated_code + def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. 
+ + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + "RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];" + "\n" + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + for i in range(reuse_factor): + generated_code += ( + f" pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + "RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" + " }\n" + " }\n" + "}\n" + "};\n" + ) + + return generated_code + @model_optimizer() def write_hls(self, model): self.writer.write_hls(model) diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index f1f1080996..6d7243dd8b 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -49,3 +49,25 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' + + def transform(self, model, node): + node_class = node.__class__.__name__ + if '1D' in node_class: + self._generate_pointwise_conv1d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_pointwise_conv1d(self, node): + code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 874349aab3..a4fbdd405f 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -56,6 +56,8 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" @@ -89,6 +91,11 @@ def format(self, node): else: params['fill_fn'] = 
'FillConv1DBuffer' + if node.get_attr('filt_width') == 1 and node.model.config.get_config_value('IOType') == 'io_parallel': + params['pointwise_fn'] = f'pointwise_conv_{node.index}' + else: + params['pointwise_fn'] = 'PointwiseConv1D' + conv_config = self.template.format(**params) mult_params = self._default_config_params(node) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index e4db43682e..32fa7321c5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -25,6 +25,96 @@ template class FillConv2DBuffer { } }; +template class PointwiseConv1D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / 
CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 0f2e89ac8f..7cceabfe1b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -56,8 +56,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { - pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + if (CONFIG_T::reuse_factor > 1) { + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index aabc869823..0d9afb10cb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,356 +84,5 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); -#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - -// Convolve, saving all multiplication results to accumulate later -ConvOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - ConvFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: - for (int 
cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - int index_weight = cc * CONFIG_T::n_filt + ff; - int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || - (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - mult[index_mult] = data[index_data] * weights[index_weight]; - } - } // end channel loop - } // end filter loop - } // end output loop - - // Initialize accumulator with input biases - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - acc[ii][ff] = biases[ff]; - } - } - -// Accumulate multiplication result -AccumOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - AccumFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Do "dot product" sum within filter and sum over channels - AccumChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - acc[ii][ff] += mult[index_mult]; - } // end channel loop - } // end filter loop - } // end output loop - - // Cast to "res_t" type - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); - } - } -} - -template -void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; - #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - -RFInputLoop: - for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { - #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; - } - } - - pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); - pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) - pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) - pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) - pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) - pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) - pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) - pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) - pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) - pointwise_conv_1d_latency_cl(data_tmp[9], 
res_tmp[9], weights, biases); - if (CONFIG_T::reuse_factor > 10) - pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); - if (CONFIG_T::reuse_factor > 11) - pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); - if (CONFIG_T::reuse_factor > 12) - pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); - if (CONFIG_T::reuse_factor > 13) - pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); - if (CONFIG_T::reuse_factor > 14) - pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); - if (CONFIG_T::reuse_factor > 15) - pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); - if (CONFIG_T::reuse_factor > 16) - pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); - if (CONFIG_T::reuse_factor > 17) - pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); - if (CONFIG_T::reuse_factor > 18) - pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); - if (CONFIG_T::reuse_factor > 19) - pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); - if (CONFIG_T::reuse_factor > 20) - pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); - if (CONFIG_T::reuse_factor > 21) - pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); - if (CONFIG_T::reuse_factor > 22) - pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); - if (CONFIG_T::reuse_factor > 23) - pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); - if (CONFIG_T::reuse_factor > 24) - pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); - if (CONFIG_T::reuse_factor > 25) - pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); - if (CONFIG_T::reuse_factor > 26) - pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); - if (CONFIG_T::reuse_factor > 27) - pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); - if (CONFIG_T::reuse_factor > 28) - pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); - if (CONFIG_T::reuse_factor > 29) - pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); - if (CONFIG_T::reuse_factor > 30) - pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); - if (CONFIG_T::reuse_factor > 31) - pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); - if (CONFIG_T::reuse_factor > 32) - pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); - if (CONFIG_T::reuse_factor > 33) - pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); - if (CONFIG_T::reuse_factor > 34) - pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); - if (CONFIG_T::reuse_factor > 35) - pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); - if (CONFIG_T::reuse_factor > 36) - pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases); - if (CONFIG_T::reuse_factor > 37) - pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases); - if (CONFIG_T::reuse_factor > 38) - pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases); - if (CONFIG_T::reuse_factor > 39) - pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases); - if (CONFIG_T::reuse_factor > 40) - pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases); - if (CONFIG_T::reuse_factor > 41) - pointwise_conv_1d_latency_cl(data_tmp[41], 
res_tmp[41], weights, biases);
-    if (CONFIG_T::reuse_factor > 42)
-        pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
-    if (CONFIG_T::reuse_factor > 43)
-        pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
-    if (CONFIG_T::reuse_factor > 44)
-        pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
-    if (CONFIG_T::reuse_factor > 45)
-        pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 46)
-        pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
-    if (CONFIG_T::reuse_factor > 47)
-        pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
-    if (CONFIG_T::reuse_factor > 48)
-        pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
-    if (CONFIG_T::reuse_factor > 49)
-        pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
-    if (CONFIG_T::reuse_factor > 50)
-        pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
-    if (CONFIG_T::reuse_factor > 51)
-        pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
-    if (CONFIG_T::reuse_factor > 52)
-        pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
-    if (CONFIG_T::reuse_factor > 53)
-        pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
-    if (CONFIG_T::reuse_factor > 54)
-        pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
-    if (CONFIG_T::reuse_factor > 55)
-        pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 56)
-        pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
-    if (CONFIG_T::reuse_factor > 57)
-        pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
-    if (CONFIG_T::reuse_factor > 58)
-        pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
-    if (CONFIG_T::reuse_factor > 59)
-        pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
-    if (CONFIG_T::reuse_factor > 60)
-        pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
-    if (CONFIG_T::reuse_factor > 61)
-        pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
-    if (CONFIG_T::reuse_factor > 62)
-        pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
-    if (CONFIG_T::reuse_factor > 63)
-        pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
-    if (CONFIG_T::reuse_factor > 64)
-        pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
-    if (CONFIG_T::reuse_factor > 65)
-        pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
-    if (CONFIG_T::reuse_factor > 66)
-        pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
-    if (CONFIG_T::reuse_factor > 67)
-        pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
-    if (CONFIG_T::reuse_factor > 68)
-        pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
-    if (CONFIG_T::reuse_factor > 69)
-        pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
-    if (CONFIG_T::reuse_factor > 70)
-        pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
-    if (CONFIG_T::reuse_factor > 71)
-        pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
-    if (CONFIG_T::reuse_factor > 72)
-        pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
-    if (CONFIG_T::reuse_factor > 73)
-        pointwise_conv_1d_latency_cl(data_tmp[73],
res_tmp[73], weights, biases); - if (CONFIG_T::reuse_factor > 74) - pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); - if (CONFIG_T::reuse_factor > 75) - pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); - if (CONFIG_T::reuse_factor > 76) - pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); - if (CONFIG_T::reuse_factor > 77) - pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); - if (CONFIG_T::reuse_factor > 78) - pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); - if (CONFIG_T::reuse_factor > 79) - pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); - if (CONFIG_T::reuse_factor > 80) - pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); - if (CONFIG_T::reuse_factor > 81) - pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); - if (CONFIG_T::reuse_factor > 82) - pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); - if (CONFIG_T::reuse_factor > 83) - pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); - if (CONFIG_T::reuse_factor > 84) - pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); - if (CONFIG_T::reuse_factor > 85) - pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); - if (CONFIG_T::reuse_factor > 86) - pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); - if (CONFIG_T::reuse_factor > 87) - pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); - if (CONFIG_T::reuse_factor > 88) - pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); - if (CONFIG_T::reuse_factor > 89) - pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); - if (CONFIG_T::reuse_factor > 90) - pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); - if (CONFIG_T::reuse_factor > 91) - pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); - if (CONFIG_T::reuse_factor > 92) - pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); - if (CONFIG_T::reuse_factor > 93) - pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); - if (CONFIG_T::reuse_factor > 94) - pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); - if (CONFIG_T::reuse_factor > 95) - pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); - if (CONFIG_T::reuse_factor > 96) - pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); - if (CONFIG_T::reuse_factor > 97) - pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); - if (CONFIG_T::reuse_factor > 98) - pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); - if (CONFIG_T::reuse_factor > 99) - pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); - if (CONFIG_T::reuse_factor > 100) - pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); - if (CONFIG_T::reuse_factor > 101) - pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); - if (CONFIG_T::reuse_factor > 102) - pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); - if (CONFIG_T::reuse_factor > 103) - pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); - if (CONFIG_T::reuse_factor > 104) - pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); - if (CONFIG_T::reuse_factor > 105) - 
pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); - if (CONFIG_T::reuse_factor > 106) - pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); - if (CONFIG_T::reuse_factor > 107) - pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); - if (CONFIG_T::reuse_factor > 108) - pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); - if (CONFIG_T::reuse_factor > 109) - pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); - if (CONFIG_T::reuse_factor > 110) - pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); - if (CONFIG_T::reuse_factor > 111) - pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); - if (CONFIG_T::reuse_factor > 112) - pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); - if (CONFIG_T::reuse_factor > 113) - pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); - if (CONFIG_T::reuse_factor > 114) - pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); - if (CONFIG_T::reuse_factor > 115) - pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); - if (CONFIG_T::reuse_factor > 116) - pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); - if (CONFIG_T::reuse_factor > 117) - pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); - if (CONFIG_T::reuse_factor > 118) - pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); - if (CONFIG_T::reuse_factor > 119) - pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); - -RFOutputLoop: - for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerOutputLoop: - for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { - #pragma HLS UNROLL - res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; - } - } -} - } // namespace nnet #endif diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index cbe2036712..a7ad3437b2 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -15,11 +15,13 @@ strides1d_options = [(1,), (2,)] strides2d_options = [(1, 1), (2, 2)] strategy_options = ['Latency', 'Resource'] +rf_options = [1, 2] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) +@pytest.mark.parametrize('rf', rf_options) @pytest.mark.parametrize( 'backend, io_type, strategy, conv_impl', [ @@ -36,7 +38,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): +def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -61,10 +63,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl + config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' + / 
f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{rf}_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From 30c5c70f649553ab11611f6b02f8ab84bd86e801 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 8 Oct 2023 05:40:04 -0700 Subject: [PATCH 12/41] fix indent --- hls4ml/backends/fpga/fpga_backend.py | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 349a5ddbc8..35151af348 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -874,44 +874,45 @@ def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): generated_code = ( "template\n" "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" + " public:\n" " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - "RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];" - "\n" - " }}\n" - " }}\n\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" ).format(index=layer_idx) + indent = " " for i in range(reuse_factor): + generated_code += indent generated_code += ( - f" pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" ) generated_code += ( "\n" - "RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / 
CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" " }\n" " }\n" - "}\n" "};\n" ) From a05bf69ebc99d7ce448db3f89398d615a52fe369 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 9 Oct 2023 13:28:57 -0700 Subject: [PATCH 13/41] update rf --- test/pytest/test_pointwiseconv.py | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index a7ad3437b2..79fce34103 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -15,30 +15,30 @@ strides1d_options = [(1,), (2,)] strides2d_options = [(1, 1), (2, 2)] strategy_options = ['Latency', 'Resource'] -rf_options = [1, 2] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) -@pytest.mark.parametrize('rf', rf_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_impl', + 'backend, io_type, strategy, conv_impl, rf', [ - ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), - ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), - ('Vitis', 'io_parallel', 'resource', 'LineBuffer'), - ('Vivado', 'io_parallel', 'latency', 'LineBuffer'), - ('Vitis', 'io_parallel', 'latency', 'LineBuffer'), - ('Vivado', 'io_parallel', 'latency', 'Pointwise'), - ('Vitis', 'io_parallel', 'latency', 'Pointwise'), - ('Vivado', 'io_stream', 'latency', 'LineBuffer'), - ('Vivado', 'io_stream', 'resource', 'LineBuffer'), - ('Vitis', 'io_stream', 'latency', 'LineBuffer'), - ('Vitis', 'io_stream', 'resource', 'LineBuffer'), + ('Quartus', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vitis', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'latency', 'LineBuffer', 1), + ('Vitis', 'io_parallel', 'latency', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'latency', 'Pointwise', 1), + ('Vivado', 'io_parallel', 'latency', 'Pointwise', 14), + ('Vitis', 'io_parallel', 'latency', 'Pointwise', 1), + ('Vitis', 'io_parallel', 'latency', 'Pointwise', 14), + ('Vivado', 'io_stream', 'latency', 'LineBuffer', 1), + ('Vivado', 'io_stream', 'resource', 'LineBuffer', 1), + ('Vitis', 'io_stream', 'latency', 'LineBuffer', 1), + ('Vitis', 'io_stream', 'resource', 'LineBuffer', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, conv_impl): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -67,7 +67,7 @@ def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{rf}_{backend}_{io_type}_{strategy}_{conv_impl}' + / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_{conv_impl}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, 
output_dir=output_dir, io_type=io_type, backend=backend From 445b2cd8744d3ba7928a69a1f556fe5c82c0e6d8 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 11 Oct 2023 19:58:42 -0700 Subject: [PATCH 14/41] address vlad comments part 1 --- hls4ml/backends/fpga/fpga_backend.py | 58 ------------- .../passes/{codegen.py => im2col_codegen.py} | 0 .../vivado/passes/pointwise_codegen.py | 25 ++++++ hls4ml/backends/vivado/vivado_backend.py | 58 +++++++++++++ hls4ml/templates/vivado/build_prj.tcl | 2 +- .../vivado/nnet_utils/nnet_code_gen.h | 81 +------------------ .../templates/vivado/nnet_utils/nnet_common.h | 1 + .../vivado/nnet_utils/nnet_conv1d_latency.h | 80 ++++++++++++++++++ hls4ml/writer/vivado_writer.py | 2 + 9 files changed, 168 insertions(+), 139 deletions(-) rename hls4ml/backends/fpga/passes/{codegen.py => im2col_codegen.py} (100%) create mode 100644 hls4ml/backends/vivado/passes/pointwise_codegen.py diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 35151af348..8cfaec8b3f 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -860,64 +860,6 @@ def generate_conv2d_line_buffer_fn( return generated_code - def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): - """Generate a C++ function for a pointwise convolution layer. - - Args: - layer_idx (int): Index of layer ('index' attribute). - reuse_factor (int): Number of partitions to divide the input into. - - Returns: - str: Generated C++ function - """ - - generated_code = ( - "template\n" - "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" - " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - " RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 - " }}\n" - " }}\n\n" - ).format(index=layer_idx) - indent = " " - for i in range(reuse_factor): - generated_code += indent - generated_code += ( - f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" - ) - - generated_code += ( - "\n" - " RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 - " }\n" - " }\n" - " }\n" - "};\n" - ) - - return generated_code - @model_optimizer() def write_hls(self, model): self.writer.write_hls(model) diff --git a/hls4ml/backends/fpga/passes/codegen.py 
b/hls4ml/backends/fpga/passes/im2col_codegen.py similarity index 100% rename from hls4ml/backends/fpga/passes/codegen.py rename to hls4ml/backends/fpga/passes/im2col_codegen.py diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py new file mode 100644 index 0000000000..f459d59208 --- /dev/null +++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py @@ -0,0 +1,25 @@ +from hls4ml.model.layers import Conv1D +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import Source + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' + + def transform(self, model, node): + node_class = node.__class__.__name__ + if '1D' in node_class: + self._generate_pointwise_conv1d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_pointwise_conv1d(self, node): + code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 011d576f64..8db278be9b 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -474,3 +474,61 @@ def init_garnet(self, layer): @layer_optimizer(GarNetStack) def init_garnet_stack(self, layer): self.init_garnet(layer) + + def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. 
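+
+        For example (illustrative values only), layer_idx=3 with reuse_factor=2
+        yields a class named pointwise_conv_3 whose pointwise_conv() splits the
+        input into two partitions and issues one pointwise_conv_1d_latency_cl
+        call per partition.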
+ + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + indent = " " + for i in range(reuse_factor): + generated_code += indent + generated_code += ( + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" + " }\n" + " }\n" + "};\n" + ) + + return generated_code diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 82b3c5a640..4ef8032d4f 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 8192} +catch {config_array_partition -maximum_size $maximum_size} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 32fa7321c5..1900aa2716 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -1,6 +1,7 @@ #ifndef NNET_INSTR_GEN_H_ #define NNET_INSTR_GEN_H_ +#include "nnet_conv1d_latency.h" #include "nnet_helpers.h" #include @@ -35,86 +36,6 @@ template class PointwiseConv1D { } }; -template -void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS 
ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); -#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - -// Convolve, saving all multiplication results to accumulate later -ConvOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - ConvFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - int index_weight = cc * CONFIG_T::n_filt + ff; - int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || - (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - mult[index_mult] = data[index_data] * weights[index_weight]; - } - } // end channel loop - } // end filter loop - } // end output loop - - // Initialize accumulator with input biases - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - acc[ii][ff] = biases[ff]; - } - } - -// Accumulate multiplication result -AccumOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - AccumFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Do "dot product" sum within filter and sum over channels - AccumChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - acc[ii][ff] += mult[index_mult]; - } // end channel loop - } // end filter loop - } // end output loop - - // Cast to "res_t" type - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); - } - } -} - // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index e942a1dc89..c3cf1a2de4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". 
#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8fb9f769f4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,85 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL 
+ res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 412bb8d667..2f7bb676f4 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -591,6 +591,8 @@ def write_build_script(self, model): f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) + f.write('variable maximum_size\n') + f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4192'))) f.close() # build_prj.tcl From 1dd2603558f8ceb6d16b449c67e52567650d3eaf Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 11 Oct 2023 20:01:28 -0700 Subject: [PATCH 15/41] default 4096 --- hls4ml/writer/vivado_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 2f7bb676f4..80c4094a4f 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -592,7 +592,7 @@ def write_build_script(self, model): f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) f.write('variable maximum_size\n') - f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4192'))) + f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4096'))) f.close() # build_prj.tcl From 04997c234ffed74b35ff79074d5c8b9c7788477f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 14 Oct 2023 21:27:51 -0700 Subject: [PATCH 16/41] only add pointwise function when optimizing --- hls4ml/backends/fpga/passes/im2col_codegen.py | 22 ----- .../vivado/passes/convolution_templates.py | 2 - hls4ml/backends/vivado/passes/pointwise.py | 82 ++++++++++++++++++- .../vivado/nnet_utils/nnet_code_gen.h | 10 +++ 4 files changed, 88 insertions(+), 28 deletions(-) diff --git a/hls4ml/backends/fpga/passes/im2col_codegen.py b/hls4ml/backends/fpga/passes/im2col_codegen.py index 6d7243dd8b..f1f1080996 100644 --- a/hls4ml/backends/fpga/passes/im2col_codegen.py +++ b/hls4ml/backends/fpga/passes/im2col_codegen.py @@ -49,25 +49,3 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) - - -class GeneratePointwiseConv1D(OptimizerPass): - '''Generates code for pointwise 1D convolution''' - - def match(self, node): - return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' - - def transform(self, model, node): - node_class = node.__class__.__name__ - if '1D' in node_class: - self._generate_pointwise_conv1d(node) - else: - raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') - - def _generate_pointwise_conv1d(self, node): - code_str = node.model.config.backend.generate_pointwise_conv1d_fn( - node.get_attr('index'), - node.get_attr('reuse_factor'), - ) - - node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index a4fbdd405f..60eddae806 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -56,8 +56,6 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; - template - using 
pointwise_conv = nnet::{pointwise_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index c353a10604..0353787e8c 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ b/hls4ml/backends/vivado/passes/pointwise.py @@ -8,13 +8,87 @@ Conv1DFunctionTemplate, Conv2DConfigTemplate, Conv2DFunctionTemplate, - conv1d_config_template, - conv2d_config_template, conv_mult_config_template, ) from hls4ml.model.layers import register_layer from hls4ml.model.optimizer import OptimizerPass +pointwise_conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_width = {min_width}; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index = nnet::{scale_index_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; +}}; +const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" + +pointwise_conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_height = {min_height}; + static const unsigned min_width = {min_width}; + static const ap_uint pixels[min_height * min_width]; + static const unsigned n_partitions = {n_partitions}; + static const 
unsigned n_pixels = out_height * out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index_height = nnet::{scale_index_height_type}; + template + using scale_index_width = nnet::{scale_index_width_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; +}}; +const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" + pointwise_conv1d_function_template = ( 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' ) @@ -29,7 +103,7 @@ class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): def __init__(self): super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) - self.template = conv1d_config_template + self.template = pointwise_conv1d_config_template self.mult_template = conv_mult_config_template @@ -42,7 +116,7 @@ def __init__(self): class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): def __init__(self): super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) - self.template = conv2d_config_template + self.template = pointwise_conv2d_config_template self.mult_template = conv_mult_config_template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 1900aa2716..1e922bbfed 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -36,6 +36,16 @@ template class PointwiseConv1D { } }; +template class PointwiseConv2D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + // hls4ml insert code } // namespace nnet From a181d971b38a09aa4bd0d62e303d43f08474ca0f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 9 Jun 2024 21:35:40 -0700 Subject: [PATCH 17/41] add vitis --- .../templates/vitis/nnet_utils/nnet_conv1d.h | 14 +++- .../vitis/nnet_utils/nnet_conv1d_latency.h | 80 +++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 52a404672c..1b66c646af 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -55,9 +55,19 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. 
//#pragma HLS INLINE recursive - // Nothing special to be done for io_parallel implementation if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise) { + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1) { + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); + } else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } + } else { + // Use standard unrolled implementation + conv_1d_latency_cl(data, res, weights, biases); + } } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 1bf25cc89c..3fd6160f4f 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -85,5 +85,85 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result 
+AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + } // namespace nnet #endif From a6a5c7f9a44848de88c59271f9d3298608c5bc4c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 9 Oct 2024 18:52:55 -0700 Subject: [PATCH 18/41] add flow --- hls4ml/backends/vivado/vivado_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 982fa2ce87..694cb503fe 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -115,6 +115,7 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', + 'vivado:generate_pointwise_conv1_d', ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) From 170999fae963dba6bf4091a8af60f16b17dfb96a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 10 Oct 2024 07:14:46 -0700 Subject: [PATCH 19/41] div roundup --- example-models | 2 +- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 4 +--- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/example-models b/example-models index 3cfbcfd062..ff74f73dbc 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 +Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 3fd6160f4f..bfe675ce12 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -107,9 +107,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8fb9f769f4..6f23976799 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -106,9 +106,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete 
dim=0 // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later From 4ec63876dbbd8643f195164f375882e329f27859 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 17 Oct 2024 13:50:27 -0700 Subject: [PATCH 20/41] update --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 7 +------ hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 1b66c646af..1c268ed588 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -58,12 +58,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1) { - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - assert(CONFIG_T::reuse_factor == 1); - pointwise_conv_1d_latency_cl(data, res, weights, biases); - } + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 7cceabfe1b..95d5d7fcce 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -56,12 +56,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1) { - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - assert(CONFIG_T::reuse_factor == 1); - pointwise_conv_1d_latency_cl(data, res, weights, biases); - } + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); From 6ca2f1b381072e86eb3b5949315ec8e5a0b2a92a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 28 Oct 2024 20:25:38 -0700 Subject: [PATCH 21/41] roundup --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 3 ++- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index bfe675ce12..9102a038fd 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -107,7 +107,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - constexpr unsigned multiplier_limit 
= DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); + constexpr unsigned multiplier_limit = DIV_ROUNDUP( + CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 6f23976799..2692f2912c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -106,7 +106,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); + constexpr unsigned multiplier_limit = DIV_ROUNDUP( + CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later From 352772d73386e915f94c756b761da9dc0154566f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 28 Oct 2024 20:30:11 -0700 Subject: [PATCH 22/41] restore example-models --- example-models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example-models b/example-models index ff74f73dbc..3cfbcfd062 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 +Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 From d37a843e77491416ff9d6cf9640cd13115324149 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:17:23 -0700 Subject: [PATCH 23/41] remove pointwise conv implementation option; make it default --- hls4ml/backends/vivado/passes/pointwise.py | 46 +------------------ hls4ml/backends/vivado/vivado_backend.py | 4 +- .../vivado/nnet_utils/nnet_code_gen.h | 10 ---- .../templates/vivado/nnet_utils/nnet_common.h | 2 - .../templates/vivado/nnet_utils/nnet_conv1d.h | 9 +--- .../vivado/nnet_utils/nnet_conv_stream.h | 4 +- test/pytest/test_pointwiseconv.py | 41 ++++++++--------- 7 files changed, 27 insertions(+), 89 deletions(-) diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index c671f1d670..79a72c1e6a 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ b/hls4ml/backends/vivado/passes/pointwise.py @@ -4,6 +4,7 @@ Conv1DFunctionTemplate, Conv2DConfigTemplate, Conv2DFunctionTemplate, + conv2d_config_template, conv_mult_config_template, ) from hls4ml.model.layers import register_layer @@ -42,49 +43,6 @@ }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" -pointwise_conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ - static const unsigned pad_top = {pad_top}; - static const unsigned pad_bottom = {pad_bottom}; - static const unsigned pad_left = {pad_left}; - static const unsigned pad_right = {pad_right}; - static const unsigned in_height = {in_height}; - static const unsigned in_width = {in_width}; - static const unsigned n_chan = {n_chan}; - static const unsigned filt_height = {filt_height}; - static const unsigned filt_width = 
{filt_width}; - static const unsigned kernel_size = filt_height * filt_width; - static const unsigned n_filt = {n_filt}; - static const unsigned stride_height = {stride_height}; - static const unsigned stride_width = {stride_width}; - static const unsigned out_height = {out_height}; - static const unsigned out_width = {out_width}; - static const unsigned reuse_factor = {reuse}; - static const unsigned n_zeros = {nzeros}; - static const unsigned multiplier_limit = - DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; - static const bool store_weights_in_bram = false; - static const unsigned strategy = nnet::{strategy}; - static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; - static const unsigned min_height = {min_height}; - static const unsigned min_width = {min_width}; - static const ap_uint pixels[min_height * min_width]; - static const unsigned n_partitions = {n_partitions}; - static const unsigned n_pixels = out_height * out_width / n_partitions; - template - using fill_buffer = nnet::{fill_fn}; - typedef {accum_t.name} accum_t; - typedef {bias_t.name} bias_t; - typedef {weight_t.name} weight_t; - typedef {config_t} mult_config; - template - using scale_index_height = nnet::{scale_index_height_type}; - template - using scale_index_width = nnet::{scale_index_width_type}; - template - using pointwise_conv = nnet::{pointwise_fn}; -}}; -const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" - pointwise_conv1d_function_template = ( 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' ) @@ -112,7 +70,7 @@ def __init__(self): class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): def __init__(self): super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) - self.template = pointwise_conv2d_config_template + self.template = conv2d_config_template self.mult_template = conv_mult_config_template diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index bea9b9ab35..8df4b86364 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -70,9 +70,7 @@ def _register_layer_attributes(self): cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append( - ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') - ) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 11152338e6..6011e20cca 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -21,16 +21,6 @@ template class PointwiseConv1D { } }; -template class PointwiseConv2D { - public: - static void pointwise_conv(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - // To be implemented in subclasses - } -}; - // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h 
b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 8ce3d836fa..a14517df5b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,7 +2,6 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" -#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". #define DIV_ROUNDUP(n, d) ((n + d - 1) / d) @@ -25,7 +24,6 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource, resource_unrolled }; -enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 95d5d7fcce..f0f1c133b9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -54,13 +54,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise) { - // Use pointwise unrolled implementation - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - // Use standard unrolled implementation - conv_1d_latency_cl(data, res, weights, biases); - } + // Use pointwise unrolled implementation + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 4189fb3a09..0caa435717 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -8,6 +8,8 @@ namespace nnet { +enum class conv_implementation { linebuffer = 0, encoded = 1 }; + // ************************************************* // Encoded Implementation (Vlad's) // ************************************************* @@ -56,7 +58,7 @@ template unsigned scale_index_K_lt_S(const template class scale_index_regular { public: static unsigned scale_index(const unsigned idx) { - #pragma HLS INLINE +#pragma HLS INLINE if (K >= S) { return scale_index_K_gte_S(idx); diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 1bd9a73b01..2890b0ab11 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -19,29 +19,27 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_impl, rf', + 'backend, io_type, strategy, rf', [ - ('Quartus', 'io_parallel', 'resource', 'LineBuffer', 1), - ('Quartus', 'io_stream', 'resource', 'LineBuffer', 1), - ('oneAPI', 'io_parallel', 'resource', 'LineBuffer', 1), - ('oneAPI', 'io_stream', 'resource', 'LineBuffer', 1), - ('Vivado', 'io_parallel', 'resource', 'LineBuffer', 1), - ('Vitis', 'io_parallel', 'resource', 'LineBuffer', 1), - ('Vivado', 'io_parallel', 'latency', 'LineBuffer', 1), - ('Vitis', 'io_parallel', 'latency', 'LineBuffer', 1), - ('Vivado', 'io_parallel', 'latency', 'Pointwise', 1), - ('Vivado', 'io_parallel', 'latency', 'Pointwise', 14), - ('Vitis', 'io_parallel', 'latency', 'Pointwise', 1), - ('Vitis', 'io_parallel', 'latency', 'Pointwise', 14), - ('Vivado', 'io_stream', 'latency', 'LineBuffer', 1), - ('Vivado', 'io_stream', 'resource', 
'LineBuffer', 1), - ('Vitis', 'io_stream', 'latency', 'LineBuffer', 1), - ('Vitis', 'io_stream', 'resource', 'LineBuffer', 1), - ('Catapult', 'io_stream', 'latency', 'LineBuffer', 1), - ('Catapult', 'io_stream', 'resource', 'LineBuffer', 1), + ('Quartus', 'io_parallel', 'resource', 1), + ('Quartus', 'io_stream', 'resource', 1), + ('oneAPI', 'io_parallel', 'resource', 1), + ('oneAPI', 'io_stream', 'resource', 1), + ('Vivado', 'io_parallel', 'resource', 1), + ('Vitis', 'io_parallel', 'resource', 1), + ('Vivado', 'io_parallel', 'latency', 1), + ('Vitis', 'io_parallel', 'latency', 1), + ('Vivado', 'io_parallel', 'latency', 14), + ('Vitis', 'io_parallel', 'latency', 14), + ('Vivado', 'io_stream', 'latency', 1), + ('Vivado', 'io_stream', 'resource', 1), + ('Vitis', 'io_stream', 'latency', 1), + ('Vitis', 'io_stream', 'resource', 1), + ('Catapult', 'io_stream', 'latency', 1), + ('Catapult', 'io_stream', 'resource', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl, rf): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -65,12 +63,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv default_precision = 'fixed<32,16>' config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy - config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_{conv_impl}_rf{rf}' + / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From f5629db5c55cb8b2944d4276d8a232178527ddd1 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:19:45 -0700 Subject: [PATCH 24/41] remove pointwise conv implementation option; make it default --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 1c268ed588..92b8571d88 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -56,13 +56,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], //#pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise) { - // Use pointwise unrolled implementation - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - // Use standard unrolled implementation - conv_1d_latency_cl(data, res, weights, biases); - } + // Use pointwise unrolled implementation + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { conv_1d_resource_cl(data, res, weights, biases); } From f4ae08f9c2cbd561596ee34888591a5696bd18d6 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:21:13 -0700 Subject: [PATCH 25/41] Restore tab --- hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h 
b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 0caa435717..dcd914dffe 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -58,7 +58,7 @@ template unsigned scale_index_K_lt_S(const template class scale_index_regular { public: static unsigned scale_index(const unsigned idx) { -#pragma HLS INLINE + #pragma HLS INLINE if (K >= S) { return scale_index_K_gte_S(idx); From ecd6b04aeee1ad341b9bfa939405571776664f6f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:27:57 -0700 Subject: [PATCH 26/41] Add back nnet_helpers.h --- hls4ml/templates/vivado/nnet_utils/nnet_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index a14517df5b..6db3f62f6e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". #define DIV_ROUNDUP(n, d) ((n + d - 1) / d) From 6f5cbd98c52e541cf2f3609736cf25c0c0f0a34c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:29:56 -0700 Subject: [PATCH 27/41] format --- test/pytest/test_pointwiseconv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 2890b0ab11..1cfb43e4cd 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -27,8 +27,8 @@ ('oneAPI', 'io_stream', 'resource', 1), ('Vivado', 'io_parallel', 'resource', 1), ('Vitis', 'io_parallel', 'resource', 1), - ('Vivado', 'io_parallel', 'latency', 1), - ('Vitis', 'io_parallel', 'latency', 1), + ('Vivado', 'io_parallel', 'latency', 1), + ('Vitis', 'io_parallel', 'latency', 1), ('Vivado', 'io_parallel', 'latency', 14), ('Vitis', 'io_parallel', 'latency', 14), ('Vivado', 'io_stream', 'latency', 1), @@ -66,8 +66,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, rf): config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( - test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' + test_root_path / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From 8ebeefec338bc9aa8380a8e74fd611731d7c7c19 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 12 Nov 2024 20:37:58 -0800 Subject: [PATCH 28/41] jovan comments --- .../vivado/passes/pointwise_codegen.py | 61 ++++++++++++++++++- hls4ml/backends/vivado/vivado_backend.py | 58 ------------------ .../vitis/nnet_utils/nnet_conv1d_latency.h | 2 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 2 +- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py index f459d59208..cb26fb6530 100644 --- a/hls4ml/backends/vivado/passes/pointwise_codegen.py +++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py @@ -3,6 +3,65 @@ from hls4ml.model.types import Source +def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). 
+ reuse_factor (int): Number of partitions to divide the input into. + + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + indent = " " + for i in range(reuse_factor): + generated_code += indent + generated_code += ( + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" + " }\n" + " }\n" + "};\n" + ) + + return generated_code + + class GeneratePointwiseConv1D(OptimizerPass): '''Generates code for pointwise 1D convolution''' @@ -17,7 +76,7 @@ def transform(self, model, node): raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') def _generate_pointwise_conv1d(self, node): - code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + code_str = generate_pointwise_conv1d_fn( node.get_attr('index'), node.get_attr('reuse_factor'), ) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 8df4b86364..02d3ba17bb 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -649,61 +649,3 @@ def init_garnet(self, layer): @layer_optimizer(GarNetStack) def init_garnet_stack(self, layer): self.init_garnet(layer) - - def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): - """Generate a C++ function for a pointwise convolution layer. - - Args: - layer_idx (int): Index of layer ('index' attribute). - reuse_factor (int): Number of partitions to divide the input into. 
- - Returns: - str: Generated C++ function - """ - - generated_code = ( - "template\n" - "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" - " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - " RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 - " }}\n" - " }}\n\n" - ).format(index=layer_idx) - indent = " " - for i in range(reuse_factor): - generated_code += indent - generated_code += ( - f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" - ) - - generated_code += ( - "\n" - " RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 - " }\n" - " }\n" - " }\n" - "};\n" - ) - - return generated_code diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 9102a038fd..f93fd0b269 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -127,7 +127,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = data[index_data] * weights[index_weight]; + mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); } } // end channel loop } // end filter loop diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 2692f2912c..5c7ab470de 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -126,7 +126,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = data[index_data] * weights[index_weight]; + mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); } } // end channel loop } // end filter loop From 4099c8dd1fe06a1cf39a909d4173af729c516d10 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 12 Nov 2024 21:09:31 -0800 Subject: [PATCH 29/41] p-clang-format --- 
.../templates/vitis/nnet_utils/nnet_conv1d_latency.h | 11 ++++++----- .../templates/vivado/nnet_utils/nnet_conv1d_latency.h | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index f93fd0b269..49e6ae3505 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -127,11 +127,12 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); + mult[index_mult] = CONFIG_T::template product::product( + data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -152,8 +153,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 5c7ab470de..cac2d29f1b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -126,11 +126,12 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); + mult[index_mult] = CONFIG_T::template product::product( + data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -151,8 +152,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { From d999ad8553e37f93ced03bd398e80dc0e1849891 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 12 Nov 2024 21:21:46 -0800 Subject: [PATCH 30/41] p-clang-format --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 8 ++++---- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 49e6ae3505..e32e82135f 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ 
b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -131,8 +131,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -153,8 +153,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index cac2d29f1b..51409dd102 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -130,8 +130,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -152,8 +152,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { From 5e5b81fbfd23a5e93e5d72f8b3776a8305dd40ea Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 13 Nov 2024 20:59:57 +0100 Subject: [PATCH 31/41] Introduce optional description to layer attributes --- docs/attr_doc_gen.py | 99 ++++++++++++++++++++ hls4ml/backends/catapult/catapult_backend.py | 23 +++-- hls4ml/backends/fpga/fpga_backend.py | 24 +++-- hls4ml/backends/oneapi/oneapi_backend.py | 9 +- hls4ml/backends/quartus/quartus_backend.py | 7 +- hls4ml/backends/vivado/vivado_backend.py | 23 +++-- hls4ml/model/attributes.py | 26 ++--- hls4ml/model/layers.py | 7 +- hls4ml/utils/attribute_descriptions.py | 46 +++++++++ 9 files changed, 222 insertions(+), 42 deletions(-) create mode 100644 docs/attr_doc_gen.py create mode 100644 hls4ml/utils/attribute_descriptions.py diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py new file mode 100644 index 0000000000..8bade86833 --- /dev/null +++ b/docs/attr_doc_gen.py @@ -0,0 +1,99 @@ +import numbers + +import hls4ml.backends as backends +import hls4ml.model.attributes as attributes +import hls4ml.model.layers as layers + +all_backends = backends.get_available_backends() +# Removing duplicates but preserving order +all_layers = list(dict.fromkeys(layers.layer_map.values())) + + +class AttrList: + def __init__(self, cls_name, cls_attrs) -> None: + self.cls_name = cls_name + self.config_attrs = [attr for attr in cls_attrs if attr.configurable == True] + self.type_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'TypeAttribute'] + self.weight_attrs = [attr for attr in cls_attrs if 
+        self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute']
+        self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs]
+        self.backend_attrs = {}
+
+    def add_backend_attrs(self, backend_name, backend_attrs):
+        self.backend_attrs[backend_name] = backend_attrs
+
+
+attr_map = []
+
+for layer_cls in all_layers:
+    base_attrs = layer_cls.expected_attributes
+
+    attr_list = AttrList(layer_cls.__name__, base_attrs)
+
+    for backend_name in all_backends:
+        backend = backends.get_backend(backend_name)
+
+        backend_cls = backend.create_layer_class(layer_cls)
+        backend_attrs = backend_cls.expected_attributes
+
+        diff_atts = [attr for attr in backend_attrs if attr not in base_attrs]  # Sets are faster, but don't preserve order
+        if len(diff_atts) > 0:
+            attr_list.add_backend_attrs(backend.name, diff_atts)
+
+    attr_map.append(attr_list)
+
+
+def print_attrs(attrs, file):
+    for attr in attrs:
+        if attr.value_type == numbers.Integral:
+            vtype = 'int'
+        elif attr.__class__ == attributes.ChoiceAttribute:
+            choices = ','.join([str(c) for c in attr.choices])
+            vtype = f'list [{choices}]'
+        else:
+            vtype = attr.value_type.__name__ if hasattr(attr.value_type, '__name__') else str(attr.value_type)
+
+        if attr.default is None:
+            file.write('* ' + attr.name + ': ' + vtype + '\n\n')
+        else:
+            file.write('* ' + attr.name + ': ' + vtype + ' (Default: ' + str(attr.default) + ')\n\n')
+
+        if attr.description is not None:
+            file.write('  * ' + attr.description + '\n\n')
+
+
+with open('attributes.rst', mode='w') as file:
+    file.write('================\n')
+    file.write('Layer attributes\n')
+    file.write('================\n\n\n')
+
+    for attr_list in attr_map:
+        file.write(attr_list.cls_name + '\n')
+        file.write('=' * len(attr_list.cls_name) + '\n')
+
+        if len(attr_list.base_attrs) > 0:
+            file.write('Base attributes\n')
+            file.write('---------------\n')
+            print_attrs(attr_list.base_attrs, file)
+
+        if len(attr_list.type_attrs) > 0:
+            file.write('Type attributes\n')
+            file.write('---------------\n')
+            print_attrs(attr_list.type_attrs, file)
+
+        if len(attr_list.weight_attrs) > 0:
+            file.write('Weight attributes\n')
+            file.write('-----------------\n')
+            print_attrs(attr_list.weight_attrs, file)
+
+        if len(attr_list.config_attrs) > 0:
+            file.write('Configurable attributes\n')
+            file.write('-----------------------\n')
+            print_attrs(attr_list.config_attrs, file)
+
+        if len(attr_list.backend_attrs) > 0:
+            file.write('Backend attributes\n')
+            file.write('-----------------------\n')
+            for backend, backend_attrs in attr_list.backend_attrs.items():
+                file.write(backend + '\n')
+                file.write('^' * len(backend) + '\n')
+                print_attrs(backend_attrs, file)
diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py
index 5c85bf9b7e..28e13edf37 100644
--- a/hls4ml/backends/catapult/catapult_backend.py
+++ b/hls4ml/backends/catapult/catapult_backend.py
@@ -32,6 +32,7 @@
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType
 from hls4ml.report import parse_catapult_report
+from hls4ml.utils import attribute_descriptions as descriptions
 from hls4ml.utils.fixed_point_utils import ceil_log2

@@ -51,10 +52,12 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('static', value_type=bool, default=True))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(
+                ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static)
+            )
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

         # Add ParallelizationFactor to Conv1D/2D
@@ -65,7 +68,7 @@ def _register_layer_attributes(self):
         for layer in pf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('parallelization_factor', default=1))
+            attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf))
             self.attribute_map[layer] = attrs

         # Add ConvImplementation to Convolution+Pooling layers
@@ -73,8 +76,14 @@ def _register_layer_attributes(self):
         for layer in cnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer'))
-            attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer'))
+            attrs.append(
+                ChoiceAttribute(
+                    'conv_implementation',
+                    choices=['LineBuffer', 'Encoded'],
+                    default='LineBuffer',
+                    description=descriptions.conv_implementation,
+                )
+            )
             self.attribute_map[layer] = attrs

         sep_conv_layers = [SeparableConv1D, SeparableConv2D]
diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index a9fc09b7aa..fbfed71c5b 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -45,6 +45,7 @@
     UnspecifiedPrecisionType,
     XnorPrecisionType,
 )
+from hls4ml.utils import attribute_descriptions as descriptions
 from hls4ml.writer import get_writer

@@ -74,7 +75,7 @@ def __init__(self, name):
         for layer in accum_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(TypeAttribute('accum'))
+            attrs.append(TypeAttribute('accum', description=descriptions.accum_type))
             self.attribute_map[layer] = attrs

         rf_layers = accum_layers + [
@@ -90,7 +91,7 @@ def __init__(self, name):
         for layer in rf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('reuse_factor', default=1))
+            attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor))
             self.attribute_map[layer] = attrs

         # seperable is kind of special because it is effectively two layers that will be split
@@ -104,23 +105,34 @@ def __init__(self, name):
             self.attribute_map[layer] = attrs

         act_attrs = self.attribute_map.get(Activation, [])
-        act_attrs.append(ConfigurableAttribute('table_size', default=1024))
-        act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+        act_attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+        act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
         self.attribute_map[Activation] = act_attrs

         softmax_attrs = self.attribute_map.get(Softmax, [])
-        softmax_attrs.append(ChoiceAttribute('implementation', ['latency', 'stable', 'argmax', 'legacy'], default='stable'))
-        softmax_attrs.append(ConfigurableAttribute('skip', value_type=bool, default=False))
+        softmax_attrs.append(
+            ChoiceAttribute(
+                'implementation',
+                ['latency', 'stable', 'argmax', 'legacy'],
+                default='stable',
+                description=descriptions.softmax_implementation,
+            )
+        )
+        softmax_attrs.append(
+            ConfigurableAttribute('skip', value_type=bool, default=False, description=descriptions.softmax_skip)
+        )
         softmax_attrs.append(
             TypeAttribute(
                 'exp_table',
                 default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                description=descriptions.table_type,
             )
         )
         softmax_attrs.append(
             TypeAttribute(
                 'inv_table',
                 default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                description=descriptions.table_type,
             )
         )
         self.attribute_map[Softmax] = softmax_attrs
diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index c85a8c0e94..7d0f0d48e2 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -10,6 +10,7 @@
 from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
+from hls4ml.utils import attribute_descriptions as descriptions

 # from hls4ml.report import parse_oneapi_report

@@ -30,9 +31,9 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

         # Add ParallelizationFactor to Conv1D/2D
@@ -43,7 +44,7 @@ def _register_layer_attributes(self):
         for layer in pf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('parallelization_factor', default=1))
+            attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf))
             self.attribute_map[layer] = attrs

     def _register_flows(self):
diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py
index aecad642c6..e56e1e05a6 100644
--- a/hls4ml/backends/quartus/quartus_backend.py
+++ b/hls4ml/backends/quartus/quartus_backend.py
@@ -11,6 +11,7 @@
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
 from hls4ml.report import parse_quartus_report
+from hls4ml.utils import attribute_descriptions as descriptions


 @contextmanager
@@ -39,9 +40,9 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

     def _register_flows(self):
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index 9f8a5171d3..96c13f4b37 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -31,6 +31,7 @@
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType
 from hls4ml.report import parse_vivado_report
+from hls4ml.utils import attribute_descriptions as descriptions


 class VivadoBackend(FPGABackend):
@@ -49,10 +50,12 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('static', value_type=bool, default=True))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(
+                ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static)
+            )
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

         # Add ParallelizationFactor to Conv1D/2D
@@ -63,15 +66,21 @@ def _register_layer_attributes(self):
         for layer in pf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('parallelization_factor', default=1))
+            attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf))
             self.attribute_map[layer] = attrs

         # Add ConvImplementation to Convolution+Pooling layers
         cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D]
         for layer in cnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer'))
-            attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer'))
+            attrs.append(
+                ChoiceAttribute(
+                    'conv_implementation',
+                    choices=['LineBuffer', 'Encoded'],
+                    default='LineBuffer',
+                    description=descriptions.conv_implementation,
+                )
+            )
             self.attribute_map[layer] = attrs

     def _register_flows(self):
diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py
index 0e8df6e10a..d4aef63409 100644
--- a/hls4ml/model/attributes.py
+++ b/hls4ml/model/attributes.py
@@ -36,11 +36,12 @@ class Attribute:
     """

-    def __init__(self, name, value_type=Integral, default=None, configurable=False):
+    def __init__(self, name, value_type=Integral, default=None, configurable=False, description=None):
         self.name = name
         self.value_type = value_type
         self.default = default
         self.configurable = configurable
+        self.description = description

     def validate_value(self, value):
         if self.value_type is not None:
@@ -68,8 +69,8 @@ class ConfigurableAttribute(Attribute):
     when defining the expected attributes of layer classes.
     """

-    def __init__(self, name, value_type=int, default=None):
-        super().__init__(name, value_type, default, configurable=True)
+    def __init__(self, name, value_type=int, default=None, description=None):
+        super().__init__(name, value_type, default, configurable=True, description=description)


 class TypeAttribute(Attribute):
@@ -79,10 +80,10 @@ class TypeAttribute(Attribute):
     As a convention, the name of the attribute storing a type will end in ``_t``.
     """

-    def __init__(self, name, default=None, configurable=True):
+    def __init__(self, name, default=None, configurable=True, description=None):
         if not name.endswith('_t'):
             name += '_t'
-        super().__init__(name, value_type=NamedType, default=default, configurable=configurable)
+        super().__init__(name, value_type=NamedType, default=default, configurable=configurable, description=description)


 class ChoiceAttribute(Attribute):
@@ -90,13 +91,12 @@ class ChoiceAttribute(Attribute):
     Represents an attribute whose value can be one of several predefined values.
     """

-    def __init__(self, name, choices, default=None, configurable=True):
-        super().__init__(name, value_type=list, default=default, configurable=configurable)
+    def __init__(self, name, choices, default=None, configurable=True, description=None):
+        super().__init__(name, value_type=list, default=default, configurable=configurable, description=description)
         assert len(choices) > 0
         if default is not None:
             assert default in choices
         self.choices = choices
-        self.value_type = str(self.choices)

     def validate_value(self, value):
         return value in self.choices
@@ -107,8 +107,8 @@ class WeightAttribute(Attribute):
     Represents an attribute that will store a weight variable.
     """

-    def __init__(self, name):
-        super().__init__(name, value_type=WeightVariable, default=None, configurable=False)
+    def __init__(self, name, description=None):
+        super().__init__(name, value_type=WeightVariable, default=None, configurable=False, description=description)


 class CodeAttrubute(Attribute):
@@ -116,8 +116,10 @@ class CodeAttrubute(Attribute):
     Represents an attribute that will store generated source code block.
     """

-    def __init__(self, name):
-        super(WeightAttribute, self).__init__(name, value_type=Source, default=None, configurable=False)
+    def __init__(self, name, description=None):
+        super(WeightAttribute, self).__init__(
+            name, value_type=Source, default=None, configurable=False, description=description
+        )


 # endregion
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 7dbeb4567e..45357344a6 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -26,6 +26,7 @@
     WeightVariable,
     find_minimum_width,
 )
+from hls4ml.utils import attribute_descriptions as descriptions
 from hls4ml.utils.string_utils import convert_to_snake_case

@@ -53,9 +54,9 @@ class Layer:
     """

     _expected_attributes = [
-        Attribute('index'),
-        ConfigurableAttribute('trace', default=False),
-        TypeAttribute('result'),
+        Attribute('index', description=descriptions.index),
+        ConfigurableAttribute('trace', default=False, description=descriptions.trace),
+        TypeAttribute('result', description=descriptions.result_type),
     ]

     @classproperty
diff --git a/hls4ml/utils/attribute_descriptions.py b/hls4ml/utils/attribute_descriptions.py
new file mode 100644
index 0000000000..92cee9e791
--- /dev/null
+++ b/hls4ml/utils/attribute_descriptions.py
@@ -0,0 +1,46 @@
+"""Strings holding attribute descriptions."""
+
+# Common attributes
+
+reuse_factor = (
+    'The number of times each multiplier is used by controlling the amount of pipelining/unrolling. '
+    'Lower number results in more parallelism and lower latency at the expense of the resources used. '
+)
+
+index = 'Internal node counter used for bookkeeping and variable/tensor naming.'
+trace = 'Enables saving of layer output (tracing).'
+
+result_type = 'The datatype (precision) of the output tensor.'
+accum_type = 'The datatype (precision) used to store intermediate results of the computation within the layer.'
+
+# Activation-related attributes
+
+table_size = 'The size of the lookup table used to approximate the function.'
+table_type = 'The datatype (precision) used for the values of the lookup table.'
+
+softmax_implementation = (
+    'Choice of implementation of softmax function. '
+    '"latency" provides good latency at the expense of extra resources. Performs well on a small number of classes. '
+    '"stable" may require extra clock cycles but has better accuracy. '
+    '"legacy" is the older implementation which has poor accuracy, but is fast and has low resource use. '
+    'It is superseded by the "latency" implementation for most applications. '
+    '"argmax" is a special implementation that can be used if only the output with the highest probability is important. '
+    'Using this implementation will save resources and clock cycles.'
+)
+softmax_skip = 'If enabled, skips the softmax node and returns the raw outputs.'
+
+# Convolution-related attributes
+
+conv_pf = (
+    'The number of outputs computed in parallel. Essentially the number of multiplications of the input window with the '
+    'convolution kernel occurring in parallel. '
+    'Higher number results in more parallelism (lower latency and II) at the expense of resources used. '
+)
+conv_implementation = '"LineBuffer" implementation is preferred over "Encoded" for most use cases.'
+
+# Recurrent-related attributes
+
+recurrent_static = (
+    'If set to True, will reuse the same recurrent block for computation, resulting in lower resource '
+    'usage at the expense of serialized computation and higher latency/II.'
+)
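[Illustration, not part of the patch series: a minimal sketch of how the `description` keyword introduced above is meant to be used. It assumes an hls4ml checkout with this patch applied, so the two modules are importable.]

    # Attach and read back a description on a configurable attribute.
    from hls4ml.model.attributes import ConfigurableAttribute
    from hls4ml.utils import attribute_descriptions as descriptions

    rf = ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor)
    print(rf.name, '->', rf.description)  # prints the shared reuse-factor help string

The same keyword is threaded through TypeAttribute, ChoiceAttribute and WeightAttribute, so the documentation generator above can render one help string per attribute regardless of its kind.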
From 1214b65a5e89ab6ce52429d131947014545aeeb0 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Wed, 13 Nov 2024 21:17:14 +0100
Subject: [PATCH 32/41] Pre-commit fix

---
 docs/attr_doc_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py
index 8bade86833..605c7669e4 100644
--- a/docs/attr_doc_gen.py
+++ b/docs/attr_doc_gen.py
@@ -12,7 +12,7 @@ class AttrList:
     def __init__(self, cls_name, cls_attrs) -> None:
         self.cls_name = cls_name
-        self.config_attrs = [attr for attr in cls_attrs if attr.configurable == True]
+        self.config_attrs = [attr for attr in cls_attrs if attr.configurable is True]
         self.type_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'TypeAttribute']
         self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute']
         self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs]

From daae96d4f75f3a0ac065ca8fb27021d28fa723a7 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Wed, 13 Nov 2024 21:29:42 -0800
Subject: [PATCH 33/41] fix

---
 hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h  | 2 +-
 hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index e32e82135f..b5f29fda06 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -127,7 +127,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
                 (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
                 mult[index_mult] = 0;
             } else {
-                mult[index_mult] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(
+                mult[index_mult] = CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::weight_t>::product(
                     data[index_data], weights[index_weight]);
             }
         } // end channel loop
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index 51409dd102..b04485af9a 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -126,7 +126,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
                 (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
                 mult[index_mult] = 0;
             } else {
-                mult[index_mult] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(
+                mult[index_mult] = CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::weight_t>::product(
                     data[index_data], weights[index_weight]);
             }
         } // end channel loop

From e813d41fdedfc1d704446c7d4310e12b4ca47c7f Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Wed, 20 Nov 2024 22:11:41 +0100
Subject: [PATCH 34/41] Tweak writing of all attributes, allow writing only
 configurable attributes

---
 docs/attr_doc_gen.py       | 154 ++++++++++++++++++++++++-------
 hls4ml/model/attributes.py |  27 ++++++-
 hls4ml/model/types.py      |  15 +++-
 3 files changed, 137 insertions(+), 59 deletions(-)

diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py
index 605c7669e4..0ba2a5b77e 100644
--- a/docs/attr_doc_gen.py
+++ b/docs/attr_doc_gen.py
@@ -4,10 +4,6 @@
 import hls4ml.model.attributes as attributes
 import hls4ml.model.layers as layers

-all_backends = backends.get_available_backends()
-# Removing duplicates but preserving order
-all_layers = list(dict.fromkeys(layers.layer_map.values()))
-

 class AttrList:
     def __init__(self, cls_name, cls_attrs) -> None:
@@ -17,29 +13,61 @@ def __init__(self, cls_name, cls_attrs) -> None:
         self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute']
         self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs]
         self.backend_attrs = {}
+        self.reverse_backend_attrs = []  # Will hold (attr, backend_name) pairs, used temporarily
+        self.unique_backend_attrs = []

     def add_backend_attrs(self, backend_name, backend_attrs):
         self.backend_attrs[backend_name] = backend_attrs
+        for attr in backend_attrs:
+            self.reverse_backend_attrs.append((attr, backend_name))
+
+    def sift_backend_attrs(self):
+        grouped_dict = {}
+        for attr, backend_name in self.reverse_backend_attrs:
+            if attr not in grouped_dict:
+                grouped_dict[attr] = []
+            grouped_dict[attr].append(backend_name)
+
+        for attr, backend_names in grouped_dict.items():
+            attr.available_in = backend_names
+            self.unique_backend_attrs.append(attr)
+
+    @property
+    def only_configurable(self):
+        all_attrs = self.config_attrs + self.type_attrs + self.unique_backend_attrs
+        return [attr for attr in all_attrs if attr.configurable is True]
+

+def convert_to_attr_list():
+    all_backends = backends.get_available_backends()
+    # Removing duplicates but preserving order
+    all_layers = list(dict.fromkeys(layers.layer_map.values()))
+    all_layers_attrs = []

-attr_map = []
+    for layer_cls in all_layers:
+        base_attrs = layer_cls.expected_attributes

-for layer_cls in all_layers:
-    base_attrs = layer_cls.expected_attributes
+        attr_list = AttrList(layer_cls.__name__, base_attrs)

-    attr_list = AttrList(layer_cls.__name__, base_attrs)
+        for backend_name in all_backends:
+            backend = backends.get_backend(backend_name)

-    for backend_name in all_backends:
-        backend = backends.get_backend(backend_name)
+            backend_cls = backend.create_layer_class(layer_cls)
+            backend_attrs = backend_cls.expected_attributes

-        backend_cls = backend.create_layer_class(layer_cls)
-        backend_attrs = backend_cls.expected_attributes
+            diff_atts = [
+                attr for attr in backend_attrs if attr not in base_attrs
+            ]  # Sets are faster, but don't preserve order
+            if len(diff_atts) > 0:
+                attr_list.add_backend_attrs(backend.name, diff_atts)

-        diff_atts = [attr for attr in backend_attrs if attr not in base_attrs]  # Sets are faster, but don't preserve order
-        if len(diff_atts) > 0:
-            attr_list.add_backend_attrs(backend.name, diff_atts)
+        all_layers_attrs.append(attr_list)

-    attr_map.append(attr_list)
+    for attr_list in all_layers_attrs:
+        attr_list.sift_backend_attrs()
+
+    return all_layers_attrs


 def print_attrs(attrs, file):
@@ -60,40 +88,62 @@ def print_attrs(attrs, file):
         if attr.description is not None:
             file.write('  * ' + attr.description + '\n\n')

+        if hasattr(attr, 'available_in'):
+            file.write('  * Available in: ' + ', '.join(attr.available_in) + '\n\n')
+
+
+def write_all_attributes(all_layers_attrs):
+    with open('attributes.rst', mode='w') as file:
+        file.write('================\n')
+        file.write('Layer attributes\n')
+        file.write('================\n\n\n')
+
+        for attr_list in all_layers_attrs:
+            file.write(attr_list.cls_name + '\n')
+            file.write('=' * len(attr_list.cls_name) + '\n')
+
+            if len(attr_list.base_attrs) > 0:
+                file.write('Base attributes\n')
+                file.write('---------------\n')
+                print_attrs(attr_list.base_attrs, file)
+
+            if len(attr_list.type_attrs) > 0:
+                file.write('Type attributes\n')
+                file.write('---------------\n')
+                print_attrs(attr_list.type_attrs, file)
+
+            if len(attr_list.weight_attrs) > 0:
+                file.write('Weight attributes\n')
+                file.write('-----------------\n')
+                print_attrs(attr_list.weight_attrs, file)
+
+            if len(attr_list.config_attrs) > 0:
+                file.write('Configurable attributes\n')
+                file.write('-----------------------\n')
+                print_attrs(attr_list.config_attrs, file)
+
+            if len(attr_list.backend_attrs) > 0:
+                file.write('Backend-specific attributes\n')
+                file.write('---------------------------\n')
+                print_attrs(attr_list.unique_backend_attrs, file)
+
+
+def write_only_configurable(all_layers_attrs):
+    with open('attributes.rst', mode='w') as file:
+        file.write('================\n')
+        file.write('Layer attributes\n')
+        file.write('================\n\n\n')
+
+        for attr_list in all_layers_attrs:
+            file.write(attr_list.cls_name + '\n')
+            file.write('=' * len(attr_list.cls_name) + '\n')
+
+            config_attrs = attr_list.only_configurable
+            if len(config_attrs) > 0:
+                print_attrs(config_attrs, file)
+

-with open('attributes.rst', mode='w') as file:
-    file.write('================\n')
-    file.write('Layer attributes\n')
-    file.write('================\n\n\n')
-
-    for attr_list in attr_map:
-        file.write(attr_list.cls_name + '\n')
-        file.write('=' * len(attr_list.cls_name) + '\n')
-
-        if len(attr_list.base_attrs) > 0:
-            file.write('Base attributes\n')
-            file.write('---------------\n')
-            print_attrs(attr_list.base_attrs, file)
-
-        if len(attr_list.type_attrs) > 0:
-            file.write('Type attributes\n')
-            file.write('---------------\n')
-            print_attrs(attr_list.type_attrs, file)
-
-        if len(attr_list.weight_attrs) > 0:
-            file.write('Weight attributes\n')
-            file.write('-----------------\n')
-            print_attrs(attr_list.weight_attrs, file)
-
-        if len(attr_list.config_attrs) > 0:
-            file.write('Configurable attributes\n')
-            file.write('-----------------------\n')
-            print_attrs(attr_list.config_attrs, file)
-
-        if len(attr_list.backend_attrs) > 0:
-            file.write('Backend attributes\n')
-            file.write('-----------------------\n')
-            for backend, backend_attrs in attr_list.backend_attrs.items():
-                file.write(backend + '\n')
-                file.write('^' * len(backend) + '\n')
-                print_attrs(backend_attrs, file)
+if __name__ == '__main__':
+    all_layers_attrs = convert_to_attr_list()
+    write_all_attributes(all_layers_attrs)
+    # write_only_configurable(all_layers_attrs)
diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py
index d4aef63409..d03d2bd108 100644
--- a/hls4ml/model/attributes.py
+++ b/hls4ml/model/attributes.py
@@ -60,6 +60,20 @@ def config_name(self):
         """
         return convert_to_pascal_case(self.name)

+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Attribute):
+            return NotImplemented
+        return (
+            self.name == other.name
+            and self.value_type == other.value_type
+            and self.default == other.default
+            and self.configurable == other.configurable
+            and self.description == other.description
+        )
+
+    def __hash__(self) -> int:
+        return hash((self.name, self.value_type, self.default, self.configurable, self.description))
+

 class ConfigurableAttribute(Attribute):
     """
@@ -83,7 +97,7 @@ class ConfigurableAttribute(Attribute):
     when defining the expected attributes of layer classes.
     """

-    def __init__(self, name, value_type=int, default=None, description=None):
+    def __init__(self, name, value_type=Integral, default=None, description=None):
         super().__init__(name, value_type, default, configurable=True, description=description)

@@ -101,6 +115,13 @@ def __init__(self, name, choices, default=None, configurable=True, description=N
     def validate_value(self, value):
         return value in self.choices

+    def __eq__(self, other: object) -> bool:
+        base_eq = super().__eq__(other)
+        return base_eq and hasattr(other, 'choices') and set(self.choices) == set(other.choices)
+
+    def __hash__(self) -> int:
+        return super().__hash__() ^ hash(tuple(sorted(self.choices)))
+

 class WeightAttribute(Attribute):
     """
@@ -117,9 +138,7 @@ class CodeAttrubute(Attribute):
     """

     def __init__(self, name, description=None):
-        super(WeightAttribute, self).__init__(
-            name, value_type=Source, default=None, configurable=False, description=description
-        )
+        super().__init__(name, value_type=Source, default=None, configurable=False, description=description)


 # endregion
diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py
index 9fb257a1ef..9d0a97440f 100644
--- a/hls4ml/model/types.py
+++ b/hls4ml/model/types.py
@@ -64,12 +64,15 @@ def __init__(self, width, signed):
         self.width = width
         self.signed = signed

-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         eq = self.width == other.width
         eq = eq and self.signed == other.signed

         return eq

+    def __hash__(self) -> int:
+        return hash((self.width, self.signed))
+

 class IntegerPrecisionType(PrecisionType):
     """Arbitrary precision integer data type.
@@ -89,12 +92,15 @@ def __str__(self):
         return typestring

     # Does this need to make sure other is also an IntegerPrecisionType? I could see a match between Fixed and Integer
-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         if isinstance(other, IntegerPrecisionType):
             return super().__eq__(other)

         return False

+    def __hash__(self) -> int:
+        return super().__hash__()
+
     @property
     def integer(self):
         return self.width
@@ -186,7 +192,7 @@ def __str__(self):
         typestring = '{signed}fixed<{args}>'.format(signed='u' if not self.signed else '', args=args)
         return typestring

-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         if isinstance(other, FixedPrecisionType):
             eq = super().__eq__(other)
             eq = eq and self.integer == other.integer
@@ -197,6 +203,9 @@ def __eq__(self, other):

         return False

+    def __hash__(self) -> int:
+        return super().__hash__() ^ hash((self.integer, self.rounding_mode, self.saturation_mode, self.saturation_bits))
+

 class XnorPrecisionType(PrecisionType):
     """
From d56dc7349e94ad8b6aaefe5b0f22385d3d9a9e03 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Thu, 21 Nov 2024 21:28:41 -0800
Subject: [PATCH 35/41] vladimir comments

---
 .../vivado/passes/convolution_templates.py    | 20 +++--
 hls4ml/backends/vivado/passes/pointwise.py    | 36 +--------
 .../vivado/passes/pointwise_codegen.py        | 78 +++++++++----------
 .../templates/vitis/nnet_utils/nnet_conv1d.h  | 32 +++++---
 .../vitis/nnet_utils/nnet_conv1d_latency.h    |  8 +-
 .../templates/vivado/nnet_utils/nnet_conv1d.h | 32 +++++---
 .../vivado/nnet_utils/nnet_conv1d_latency.h   |  8 +-
 .../vivado/nnet_utils/nnet_function_stubs.h   | 10 +++
 8 files changed, 114 insertions(+), 110 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 02f13ef6f0..551e4f4167 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -60,6 +60,8 @@ typedef {config_t} mult_config;
     template<unsigned K, unsigned S, unsigned W>
     using scale_index = nnet::{scale_index_type}<K, S, W>;
+    template<class data_T, class res_T, class CONFIG_T>
+    using conv_kernel = nnet::{conv_fn}<data_T, res_T, CONFIG_T>;
 }};
 const ap_uint<config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""

@@ -93,16 +95,24 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv1DBuffer'

-        if node.get_attr('filt_width') == 1 and node.model.config.get_config_value('IOType') == 'io_parallel':
-            params['pointwise_fn'] = f'pointwise_conv_{node.index}'
+        is_pointwise_parallel_latency = node.get_attr('filt_width') == 1 and node.get_attr('strategy').lower() == 'latency' and node.model.config.get_config_value('IOType') == 'io_parallel'
+        if is_pointwise_parallel_latency:
+            params['conv_fn'] = f'pointwise_conv_{node.index}'
         else:
-            params['pointwise_fn'] = 'PointwiseConv1D'
+            if node.get_attr('strategy').lower() == 'latency':
+                params['conv_fn'] = 'Conv1DLatency'
+            elif node.get_attr('strategy').lower() == 'resource':
+                params['conv_fn'] = 'Conv1DResource'

         conv_config = self.template.format(**params)

         mult_params = self._default_config_params(node)
-        mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
-        mult_params['n_out'] = node.get_attr('n_filt')
+        if is_pointwise_parallel_latency:
+            mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+            mult_params['n_out'] = node.get_attr('n_filt') / mult_params['reuse']
+        else:
+            mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
+            mult_params['n_out'] = node.get_attr('n_filt')
         mult_params['nzeros'] = node.get_weights('weight').nzeros
         mult_params['product_type'] = get_backend('vivado').product_type(
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index 79a72c1e6a..34568b09f7 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -4,45 +4,13 @@
     Conv1DFunctionTemplate,
     Conv2DConfigTemplate,
     Conv2DFunctionTemplate,
+    conv1d_config_template,
     conv2d_config_template,
     conv_mult_config_template,
 )
 from hls4ml.model.layers import register_layer
 from hls4ml.model.optimizer import OptimizerPass

-pointwise_conv1d_config_template = """struct config{index} : nnet::conv1d_config {{
-    static const unsigned pad_left = {pad_left};
-    static const unsigned pad_right = {pad_right};
-    static const unsigned in_width = {in_width};
-    static const unsigned n_chan = {n_chan};
-    static const unsigned filt_width = {filt_width};
-    static const unsigned kernel_size = filt_width;
-    static const unsigned n_filt = {n_filt};
-    static const unsigned stride_width = {stride_width};
-    static const unsigned dilation = {dilation};
-    static const unsigned out_width = {out_width};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned n_zeros = {nzeros};
-    static const bool store_weights_in_bram = false;
-    static const unsigned strategy = nnet::{strategy};
-    static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
-    static const unsigned min_width = {min_width};
-    static const ap_uint<filt_width> pixels[min_width];
-    static const unsigned n_partitions = {n_partitions};
-    static const unsigned n_pixels = out_width / n_partitions;
-    template<class data_T, class CONFIG_T>
-    using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
-    typedef {accum_t.name} accum_t;
-    typedef {bias_t.name} bias_t;
-    typedef {weight_t.name} weight_t;
-    typedef {config_t} mult_config;
-    template<unsigned K, unsigned S, unsigned W>
-    using scale_index = nnet::{scale_index_type}<K, S, W>;
-    template<class data_T, class res_T, class CONFIG_T>
-    using pointwise_conv = nnet::{pointwise_fn}<data_T, res_T, CONFIG_T>;
-}};
-const ap_uint<config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""
-
 pointwise_conv1d_function_template = (
     'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 )
@@ -57,7 +25,7 @@ class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate):
     def __init__(self):
         super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D)
-        self.template = pointwise_conv1d_config_template
+        self.template = conv1d_config_template
         self.mult_template = conv_mult_config_template

diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
index cb26fb6530..763b3e510c 100644
--- a/hls4ml/backends/vivado/passes/pointwise_codegen.py
+++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py
@@ -15,48 +15,48 @@ def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1):
     """

     generated_code = (
-        "template<class data_T, class res_T, typename CONFIG_T>\n"
-        "class pointwise_conv_{index} : public PointwiseConv1D<data_T, res_T, CONFIG_T> {{\n"
-        "  public:\n"
-        "    static void pointwise_conv(\n"
-        "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
-        "        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n"
-        "        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n"
-        "        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n"
-        "        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n"  # noqa: E501
-        "        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n"
-        "        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n"  # noqa: E501
-        "        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n"
-        "    RFInputLoop:\n"
-        "        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n"
-        "            #pragma HLS UNROLL\n"
-        "        InnerInputLoop:\n"
-        "            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n"
-        "                #pragma HLS UNROLL\n"
-        "                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n"  # noqa: E501
-        "            }}\n"
-        "        }}\n\n"
+        'template<class data_T, class res_T, typename CONFIG_T>\n'
+        'class pointwise_conv_{index} : public Conv1DKernel<data_T, res_T, CONFIG_T> {{\n'
+        '  public:\n'
+        '    static void conv(\n'
+        '        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n'
+        '        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n'
+        '        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n'
+        '        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n'
+        '        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n'  # noqa: E501
+        '        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n'
+        '        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n'  # noqa: E501
+        '        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n'
+        '    RFInputLoop:\n'
+        '        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n'
+        '            #pragma HLS UNROLL\n'
+        '        InnerInputLoop:\n'
+        '            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n'
+        '                #pragma HLS UNROLL\n'
+        '                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n'  # noqa: E501
+        '            }}\n'
+        '        }}\n\n'
     ).format(index=layer_idx)

-    indent = "    "
+    indent = '    '
     for i in range(reuse_factor):
         generated_code += indent
         generated_code += (
-            f"pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n"
+            f'pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n'
         )

     generated_code += (
-        "\n"
-        "    RFOutputLoop:\n"
-        "    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n"
-        "        #pragma HLS UNROLL\n"
-        "    InnerOutputLoop:\n"
-        "        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n"
-        "            #pragma HLS UNROLL\n"
-        "            res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n"  # noqa: E501
-        "        }\n"
-        "    }\n"
-        "    }\n"
-        "};\n"
+        '\n'
+        '    RFOutputLoop:\n'
+        '    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n'
+        '        #pragma HLS UNROLL\n'
+        '    InnerOutputLoop:\n'
+        '        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n'
+        '            #pragma HLS UNROLL\n'
+        '            res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n'  # noqa: E501
+        '        }\n'
+        '    }\n'
+        '    }\n'
+        '};\n'
     )

     return generated_code
@@ -66,14 +66,10 @@ class GeneratePointwiseConv1D(OptimizerPass):
     '''Generates code for pointwise 1D convolution'''

     def match(self, node):
-        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel'
+        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' and node.get_attr('filt_width') == 1

     def transform(self, model, node):
-        node_class = node.__class__.__name__
-        if '1D' in node_class:
-            self._generate_pointwise_conv1d(node)
-        else:
-            raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
+        self._generate_pointwise_conv1d(node)

     def _generate_pointwise_conv1d(self, node):
         code_str = generate_pointwise_conv1d_fn(
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index 92b8571d88..46beeacb03 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -4,6 +4,7 @@
 #include "nnet_common.h"
 #include "nnet_conv1d_latency.h"
 #include "nnet_conv1d_resource.h"
+#include "nnet_function_stubs.h"
 #include <cstdlib>

 namespace nnet {
@@ -38,11 +39,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
     //#pragma HLS INLINE recursive

-    if (CONFIG_T::strategy == nnet::latency) {
-        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else {
-        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    }
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }

 template <class data_T, class res_T, typename CONFIG_T>
@@ -55,13 +52,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
     //#pragma HLS INLINE recursive

-    if (CONFIG_T::strategy == nnet::latency) {
-        // Use pointwise unrolled implementation
-        CONFIG_T::template pointwise_conv<data_T, res_T, CONFIG_T>::pointwise_conv(data, res, weights, biases);
-    } else {
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DLatency : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        //#pragma HLS INLINE region
+        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DResource : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        //#pragma HLS INLINE region
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
-}
+};

 } // namespace nnet
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index b5f29fda06..6006711d8f 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -107,9 +107,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     #pragma HLS ARRAY_PARTITION variable=biases complete dim=0

     // Limit multipliers to control parallelization
-    constexpr unsigned multiplier_limit = DIV_ROUNDUP(
-        CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor);
-#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
@@ -159,8 +157,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-            #pragma HLS UNROLL
-            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        #pragma HLS UNROLL
+            res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
 }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index f0f1c133b9..72bce78067 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -4,6 +4,7 @@
 #include "nnet_common.h"
 #include "nnet_conv1d_latency.h"
 #include "nnet_conv1d_resource.h"
+#include "nnet_function_stubs.h"
 #include <cstdlib>

 namespace nnet {
@@ -37,11 +38,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
                 typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     #pragma HLS INLINE region

-    if (CONFIG_T::strategy == nnet::latency) {
-        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else {
-        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    }
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }

 template <class data_T, class res_T, typename CONFIG_T>
@@ -53,13 +50,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],

     #pragma HLS INLINE region

-    if (CONFIG_T::strategy == nnet::latency) {
-        // Use pointwise unrolled implementation
-        CONFIG_T::template pointwise_conv<data_T, res_T, CONFIG_T>::pointwise_conv(data, res, weights, biases);
-    } else {
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DLatency : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        #pragma HLS INLINE region
+        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DResource : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        #pragma HLS INLINE region
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
-}
+};

 } // namespace nnet
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index b04485af9a..004109d954 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -106,9 +106,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     #pragma HLS ARRAY_PARTITION variable=biases complete dim=0

     // Limit multipliers to control parallelization
-    constexpr unsigned multiplier_limit = DIV_ROUNDUP(
-        CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor);
-#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
@@ -158,8 +156,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-            #pragma HLS UNROLL
-            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        #pragma HLS UNROLL
+            res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
 }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
index 1316bbe776..8ce2381e06 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
@@ -37,6 +37,16 @@ template <class data_T, class res_T, typename CONFIG_T> class DenseKernel {
     }
 };

+template <class data_T, class res_T, typename CONFIG_T> class Conv1DKernel {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                     res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        // To be implemented in subclasses
+    }
+};
+
 } // namespace nnet

 #endif

From dd021ecd672ca9a08674536f166ad3f149b1f305 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Fri, 22 Nov 2024 02:02:19 -0800
Subject: [PATCH 36/41] fix n_in/n_out

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 551e4f4167..af04df0ec0 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -108,8 +108,8 @@ def format(self, node):

         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency:
-            mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
-            mult_params['n_out'] = node.get_attr('n_filt') / mult_params['reuse']
+            mult_params['n_in'] = int(node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse'])
+            mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')

From 93acaa6f7849f1a6e33eb3aa90cdc17ba0f1d2c4 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Fri, 22 Nov 2024 08:20:52 -0800
Subject: [PATCH 37/41] pre-commit

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 10 ++++++++--
 hls4ml/backends/vivado/passes/pointwise_codegen.py     |  6 +++++-
 .../templates/vitis/nnet_utils/nnet_conv1d_latency.h   |  2 +-
 .../templates/vivado/nnet_utils/nnet_conv1d_latency.h  |  2 +-
 .../templates/vivado/nnet_utils/nnet_function_stubs.h  |  3 +--
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index af04df0ec0..bd243da290 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -95,7 +95,11 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv1DBuffer'

-        is_pointwise_parallel_latency = node.get_attr('filt_width') == 1 and node.get_attr('strategy').lower() == 'latency' and node.model.config.get_config_value('IOType') == 'io_parallel'
+        is_pointwise_parallel_latency = (
+            node.get_attr('filt_width') == 1
+            and node.get_attr('strategy').lower() == 'latency'
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+        )
         if is_pointwise_parallel_latency:
             params['conv_fn'] = f'pointwise_conv_{node.index}'
         else:
@@ -108,8 +112,8 @@ def format(self, node):

         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency:
-            mult_params['n_in'] = int(node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse'])
+            mult_params['n_in'] = int(
+                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+            )
             mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
index 763b3e510c..d41d51f82f 100644
--- a/hls4ml/backends/vivado/passes/pointwise_codegen.py
+++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py
@@ -66,7 +66,11 @@ class GeneratePointwiseConv1D(OptimizerPass):
     '''Generates code for pointwise 1D convolution'''

     def match(self, node):
-        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' and node.get_attr('filt_width') == 1
+        return (
+            isinstance(node, Conv1D)
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+            and node.get_attr('filt_width') == 1
+        )

     def transform(self, model, node):
         self._generate_pointwise_conv1d(node)
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index 6006711d8f..e166cdd470 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -157,7 +157,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-        #pragma HLS UNROLL
+            #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index 004109d954..ef2f94dcaf 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -156,7 +156,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-        #pragma HLS UNROLL
+            #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
index 8ce2381e06..97774bc95b 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
@@ -39,8 +39,7 @@ template <class data_T, class res_T, typename CONFIG_T> class DenseKernel {

 template <class data_T, class res_T, typename CONFIG_T> class Conv1DKernel {
   public:
-    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                     res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
         // To be implemented in subclasses

From 1867dfca0fcf7c58547423437cdf10c36e05068e Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Mon, 25 Nov 2024 08:09:42 -0800
Subject: [PATCH 38/41] fix resource strategy

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index bd243da290..e098107eae 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -105,7 +105,7 @@ def format(self, node):
         else:
             if node.get_attr('strategy').lower() == 'latency':
                 params['conv_fn'] = 'Conv1DLatency'
-            elif node.get_attr('strategy').lower() == 'resource':
+            else:
                 params['conv_fn'] = 'Conv1DResource'

         conv_config = self.template.format(**params)
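[Illustration, not part of the patch series: taken together, patches 35-38 leave the io_parallel Conv1D kernel selection as restated below. This is a hypothetical helper written only for readability; the real logic lives inline in convolution_templates.py.]

    def select_conv_fn(node):
        # Pointwise latency convs get a generated, reuse-factor-unrolled kernel;
        # everything else falls back to the Conv1DLatency/Conv1DResource wrappers.
        strategy = node.get_attr('strategy').lower()
        if (
            node.get_attr('filt_width') == 1
            and strategy == 'latency'
            and node.model.config.get_config_value('IOType') == 'io_parallel'
        ):
            return f'pointwise_conv_{node.index}'
        return 'Conv1DLatency' if strategy == 'latency' else 'Conv1DResource'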
@@ -85,6 +87,9 @@ def transform(self, model, node):
             can_propagate = False

         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down MatMul node; model probably not supported.', stacklevel=1
+            )
             return False

         model.remove_node(apply_alpha)
@@ -124,6 +129,9 @@ def transform(self, model, node):
             try:
                 bias = bias0 + bias1
             except ValueError:
+                warnings.warn(
+                    'Failed to propagate quantization scales down Add node; model probably not supported.', stacklevel=1
+                )
                 return False

             model.remove_node(in0)
@@ -169,6 +177,7 @@ def transform(self, model, node):
             model.insert_node(new_node)
             return True
         else:
+            warnings.warn('Failed to propagate quantization bias down Add node; model probably not supported.', stacklevel=1)
             return False

@@ -243,6 +252,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -287,6 +299,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -308,6 +323,9 @@ def transform(self, model, node):
                 can_propagate = False

         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -367,6 +385,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -388,6 +409,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -412,6 +436,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -445,6 +472,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again

From 915d2e134337451cf473147328796fb15854e67a Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Wed, 4 Dec 2024 12:57:28 -0600
Subject: [PATCH 40/41] better handle cases when there is no previous node

---
 hls4ml/model/optimizer/passes/batchnorm_opt.py | 4 ++--
 hls4ml/model/optimizer/passes/bn_fuse.py       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py
index e18d79ff4a..26b7b18e38 100644
--- a/hls4ml/model/optimizer/passes/batchnorm_opt.py
+++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py
@@ -166,7 +166,7 @@ class FuseConsecutiveBatchNormalization(OptimizerPass):
     """

     def match(self, node):
-        prev_node = node.get_input_node(node.inputs[0])
+        prev_node = node.get_input_node()
         basic_match = (
             isinstance(node, BatchNormalization)
             and isinstance(prev_node, BatchNormalization)
@@ -194,7 +194,7 @@ def match(self, node):
         return False

     def transform(self, model, node):
-        prev_node = node.get_input_node(node.inputs[0])
+        prev_node = node.get_input_node()
         prev_map = prev_node.get_output_use_map()

         if len(prev_map[prev_node.outputs[0]]) > 1:
diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py
index 000d8380ce..be81d5fb3d 100644
--- a/hls4ml/model/optimizer/passes/bn_fuse.py
+++ b/hls4ml/model/optimizer/passes/bn_fuse.py
@@ -18,7 +18,7 @@ class FuseBatchNormalization(OptimizerPass):
     """

     def match(self, node):
-        prev_node = node.get_input_node(node.inputs[0])
+        prev_node = node.get_input_node()
         basic_match = (
             isinstance(node, BatchNormalization)
             and isinstance(prev_node, (Dense, Conv1D, Conv2D))

From 88c1fe76418e79cf400acd6ef211a20ed2717f42 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <59868635+bo3z@users.noreply.github.com>
Date: Wed, 4 Dec 2024 22:52:02 +0100
Subject: [PATCH 41/41] Minor doc improvements to attributes (#57)

* Minor doc improvements to attributes

* Minor fixes
---
 hls4ml/utils/attribute_descriptions.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hls4ml/utils/attribute_descriptions.py b/hls4ml/utils/attribute_descriptions.py
index 92cee9e791..756f276fa1 100644
--- a/hls4ml/utils/attribute_descriptions.py
+++ b/hls4ml/utils/attribute_descriptions.py
@@ -5,10 +5,11 @@
 reuse_factor = (
     'The number of times each multiplier is used by controlling the amount of pipelining/unrolling. '
     'Lower number results in more parallelism and lower latency at the expense of the resources used.'
+    ' Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence the lowest possible latency.'
 )

 index = 'Internal node counter used for bookkeeping and variable/tensor naming.'
-trace = 'Enables saving of layer output (tracing).'
+trace = 'Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...).'

 result_type = 'The datatype (precision) of the output tensor.'
 accum_type = 'The datatype (precision) used to store intermediate results of the computation within the layer.'
@@ -35,8 +36,12 @@
     'The number of outputs computed in parallel. Essentially the number of multiplications of input window with the '
     'convolution kernel occuring in parallel. '
     'Higher number results in more parallelism (lower latency and II) at the expense of resources used.'
+    ' Currently only supported in io_parallel.'
 )

-conv_implementation = '"LineBuffer" implementation is preferred over "Encoded" for most use cases.'
+conv_implementation = (
+    '"LineBuffer" implementation is preferred over "Encoded" for most use cases. '
+    'This attribute only applies to io_stream.'
+)

 # Recurrent-related attributes
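
Editor's note: the move_scales change in PATCH 39 applies one repeated pattern: emit a warning, then abort the optimizer pass by returning False instead of failing silently. A distilled standalone sketch of that pattern follows; the function `try_propagate` and its arguments are hypothetical illustrations, not hls4ml API.

```python
import warnings


def try_propagate(can_propagate: bool, node_kind: str = 'Conv') -> bool:
    """Sketch of the warn-and-bail pattern added to the scale-moving passes."""
    if not can_propagate:
        # stacklevel=1 attributes the warning to this warnings.warn() call,
        # the same style used throughout move_scales.py.
        warnings.warn(
            f'Failed to propagate quantization scales down {node_kind} node; model probably not supported.',
            stacklevel=1,
        )
        return False  # the pass reports that no transformation was applied
    return True
```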
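
Editor's note: the attribute strings touched in PATCH 41 document user-facing configuration knobs, and PATCH 37 gates the pointwise-latency path on filt_width == 1, latency strategy, and io_parallel. Below is a rough sketch of where these knobs surface on the Python side; it is not part of the patch set, the toy Keras model and layer name are hypothetical, and the config keys ('Strategy', 'ReuseFactor') are assumed to follow hls4ml's usual naming.

```python
import hls4ml
from tensorflow.keras.layers import Conv1D, Input
from tensorflow.keras.models import Model

# Toy model: a single 1x1 ("pointwise") Conv1D layer, so filt_width == 1.
inp = Input(shape=(32, 4))
out = Conv1D(8, kernel_size=1, name='pointwise_conv')(inp)
model = Model(inp, out)

config = hls4ml.utils.config_from_keras_model(model, granularity='name')

# Latency strategy + io_parallel + filt_width == 1 matches the
# is_pointwise_parallel_latency condition in convolution_templates.py.
config['Model']['Strategy'] = 'Latency'

# ReuseFactor = 1: all multiplications execute in parallel (lowest latency).
config['LayerName']['pointwise_conv']['ReuseFactor'] = 1

hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, io_type='io_parallel', backend='Vivado'
)
```

ConvImplementation ('LineBuffer' vs 'Encoded') is deliberately not set in this sketch: per the updated description above, that attribute only applies to io_stream.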