From ea5c5a86fb2d353a90eb1300824b6529888b26d8 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 08:47:30 -0700 Subject: [PATCH 01/41] merge --- hls4ml/templates/vivado/build_prj.tcl | 2 +- .../templates/vivado/nnet_utils/nnet_common.h | 1 + .../templates/vivado/nnet_utils/nnet_conv1d.h | 16 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 221 ++++++++++++++++++ .../vivado/nnet_utils/nnet_conv_stream.h | 2 - 5 files changed, 237 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index d34337c573..6383b910ca 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 4096} +catch {config_array_partition -maximum_size 8192} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..b6582e1406 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,6 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; +enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..c2990ea97a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -53,9 +53,21 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region - // Nothing special to be done for io_parallel implementation if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise){ + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { + pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + } + else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } + } + else { + // Use standard unrolled implementation + conv_1d_resource_cl(data, res, weights, biases); + } } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8549ae9add 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,226 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS 
ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + //const int multiplier_limit = compute_multiplier_limit(weights); + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + // Convolve, saving all multiplication results to accumulate later + ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + int index_weight = cc*CONFIG_T::n_filt + ff; + int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + mult[index_mult] = 0; + } + else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + }//end channel loop + }//end filter loop + }//end output loop + + + // Initialize accumulator with input biases + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff]=biases[ff]; + } + } + + + // Accumulate multiplication result + AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + //Do "dot product" sum within filter and sum over channels + AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + }//end channel loop + }//end filter loop + }//end output loop + + + // Cast to "res_t" type + for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { + for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +template void pointwise_conv_1d_latency_cl_split_by_rf( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) +{ + + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + } + } + + pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); + pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); + if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); + if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], 
weights, biases); + if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); + if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); + if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); + if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); + if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); + if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); + if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); + if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); + if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); + if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); + if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); + if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); + if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); + if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); + if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); + if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); + if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); + if (CONFIG_T::reuse_factor > 36) 
pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
    if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
    if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
    if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
    if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
    if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
    if (CONFIG_T::reuse_factor > 42) pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, 
biases); + if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases); + if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases); + if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases); + if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases); + if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases); + if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); + if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); + if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); + if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); + if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); + if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); + if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); + if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); + if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); + if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); + if (CONFIG_T::reuse_factor > 101) 
pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); + if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + + for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 7bd47442f6..b763938cb3 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -8,8 +8,6 @@ namespace nnet { -enum class conv_implementation { linebuffer = 0, encoded = 1 }; - // ************************************************* // Encoded Implementation (Vlad's) // ************************************************* From 6849e0b4d0a1b352cac1d61870273882dc112705 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 22 Dec 2022 16:21:25 -0600 Subject: [PATCH 02/41] add pointwise --- hls4ml/backends/vivado/vivado_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1d4c96d982..4dab5f5c18 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,7 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # 
attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) self.attribute_map[layer] = attrs def _register_flows(self): From 0244b666652e2667c8df72c134f9abd94c731685 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:29:44 -0700 Subject: [PATCH 03/41] latency --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index c2990ea97a..e2dee3485a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -66,7 +66,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } else { // Use standard unrolled implementation - conv_1d_resource_cl(data, res, weights, biases); + conv_1d_latency_cl(data, res, weights, biases); } } else { conv_1d_resource_cl(data, res, weights, biases); From 3ae7752e70dc43d0687b39a90d7c4d0fd6f9b797 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 25 Mar 2023 18:56:58 -0700 Subject: [PATCH 04/41] unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8549ae9add..4179c1dde8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -104,6 +104,7 @@ void pointwise_conv_1d_latency_cl( // Parallel mode #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization @@ -114,6 +115,7 @@ void pointwise_conv_1d_latency_cl( ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; int index_weight = cc*CONFIG_T::n_filt + ff; int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; @@ -132,6 +134,7 @@ void pointwise_conv_1d_latency_cl( // Initialize accumulator with input biases for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL acc[ii][ff]=biases[ff]; } } @@ -152,6 +155,7 @@ void pointwise_conv_1d_latency_cl( // Cast to "res_t" type for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } @@ -169,7 +173,9 @@ template void pointwise_conv_1d_la res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 + RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL data_tmp[jj][ii] = 
data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; @@ -297,7 +303,9 @@ template void pointwise_conv_1d_la if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; From 23126b70ca5496bcc7da993d95a8d939920bd8bc Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 26 Mar 2023 17:19:08 -0700 Subject: [PATCH 05/41] add hls unroll --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 4179c1dde8..c5b520c703 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -175,6 +175,7 @@ template void pointwise_conv_1d_la RFInputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerInputLoop: for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL @@ -305,6 +306,7 @@ template void pointwise_conv_1d_la RFOutputLoop: for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL InnerOutputLoop: for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL From 6aff9e996df95955d010013c2163a723ab8a8170 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 8 Jun 2023 08:15:11 -0700 Subject: [PATCH 06/41] fix pragma from walkie --- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c5b520c703..c423c7a228 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -108,8 +108,8 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - //const int multiplier_limit = compute_multiplier_limit(weights); - //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { From 7f1c318dea6767d5b0e4996786c356d48bfa4560 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Jun 2023 18:46:37 +0000 Subject: [PATCH 07/41] [pre-commit.ci] auto fixes from pre-commit hooks --- hls4ml/backends/vivado/vivado_backend.py | 4 +- .../templates/vivado/nnet_utils/nnet_common.h | 2 +- .../templates/vivado/nnet_utils/nnet_conv1d.h | 8 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 488 +++++++++++------- 4 files changed, 311 insertions(+), 191 deletions(-) 
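For intuition on the multiplier_limit expression introduced in PATCH 06/41 above: each
split instance of pointwise_conv_1d_latency_cl computes out_width / reuse_factor outputs
and is pipelined at II = reuse_factor, so the ALLOCATION pragma appears to budget roughly
(multiplications per instance) / II multiplier cores. A quick sanity check of the
arithmetic in Python, with illustrative layer dimensions that are not taken from this PR:

    import math

    # Hypothetical pointwise layer: 100 outputs, 4 filters, 3 channels, reuse_factor 10
    out_width, n_filt, n_chan, reuse_factor = 100, 4, 3, 10
    mults_per_instance = out_width / reuse_factor * n_filt * n_chan  # 10 * 4 * 3 = 120
    multiplier_limit = math.ceil(mults_per_instance / reuse_factor)  # ceil(120 / 10) = 12
    print(multiplier_limit)  # 12 multipliers shared across the pipelined loop
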
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 4dab5f5c18..1eb58f0952 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -72,7 +72,9 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer')) + attrs.append( + ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') + ) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index b6582e1406..e942a1dc89 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -24,7 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource }; -enum class conv_implementation { linebuffer=0, encoded=1, pointwise=2}; +enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2dee3485a..0f2e89ac8f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -54,17 +54,15 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise){ + if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); - } - else { + } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); } - } - else { + } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index c423c7a228..aabc869823 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,17 +84,15 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan/CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan/CONFIG_T::reuse_factor]; - 
typename CONFIG_T::accum_t acc[CONFIG_T::out_width/CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 @@ -108,209 +106,331 @@ void pointwise_conv_1d_latency_cl( #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = ceil ( (float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor )* CONFIG_T::n_filt * CONFIG_T::n_chan ) / float(CONFIG_T::reuse_factor) ); - #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - - // Convolve, saving all multiplication results to accumulate later - ConvOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - ConvFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; - int index_weight = cc*CONFIG_T::n_filt + ff; - int index_data = (ii*CONFIG_T::stride_width-CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if((ii*CONFIG_T::stride_width) < CONFIG_T::pad_left || (ii*CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)){ + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; - } - else { + } else { mult[index_mult] = data[index_data] * weights[index_weight]; } - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL - acc[ii][ff]=biases[ff]; + acc[ii][ff] = biases[ff]; } } - - // Accumulate multiplication result - AccumOut: for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - AccumFilt: for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { - //Do "dot product" sum within filter and sum over channels - AccumChan: for(int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii*CONFIG_T::n_filt*CONFIG_T::n_chan + ff*CONFIG_T::n_chan + cc; +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int 
ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; - }//end channel loop - }//end filter loop - }//end output loop - + } // end channel loop + } // end filter loop + } // end output loop // Cast to "res_t" type - for(int ii = 0; ii < CONFIG_T::out_width/CONFIG_T::reuse_factor; ii++) { - for(int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { #pragma HLS UNROLL res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); } } } -template void pointwise_conv_1d_latency_cl_split_by_rf( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) -{ +template +void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor]; + data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width*CONFIG_T::n_filt/CONFIG_T::reuse_factor]; + res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - - RFInputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for(int ii = 0; ii < CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor; ii++) { + +RFInputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerInputLoop: + for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj*CONFIG_T::in_width*CONFIG_T::n_chan/CONFIG_T::reuse_factor+ii]; + data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; } } pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); - if 
(CONFIG_T::reuse_factor > 10) pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); - if (CONFIG_T::reuse_factor > 11) pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); - if (CONFIG_T::reuse_factor > 12) pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); - if (CONFIG_T::reuse_factor > 13) pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); - if (CONFIG_T::reuse_factor > 14) pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); - if (CONFIG_T::reuse_factor > 15) pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); - if (CONFIG_T::reuse_factor > 16) pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); - if (CONFIG_T::reuse_factor > 17) pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); - if (CONFIG_T::reuse_factor > 18) pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); - if (CONFIG_T::reuse_factor > 19) pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); - if (CONFIG_T::reuse_factor > 20) pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); - if (CONFIG_T::reuse_factor > 21) pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); - if (CONFIG_T::reuse_factor > 22) pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); - if (CONFIG_T::reuse_factor > 23) pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); - if (CONFIG_T::reuse_factor > 24) pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); - if (CONFIG_T::reuse_factor > 25) pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); - if (CONFIG_T::reuse_factor > 26) pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); - if (CONFIG_T::reuse_factor > 27) pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); - if (CONFIG_T::reuse_factor > 28) pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); - if (CONFIG_T::reuse_factor > 29) pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); - if (CONFIG_T::reuse_factor > 30) pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); - if (CONFIG_T::reuse_factor > 31) pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); - if (CONFIG_T::reuse_factor > 32) pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); - if (CONFIG_T::reuse_factor > 33) pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); - if (CONFIG_T::reuse_factor > 34) pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); - if (CONFIG_T::reuse_factor > 35) pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); - if (CONFIG_T::reuse_factor > 36) pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases); - if (CONFIG_T::reuse_factor > 37) pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases); - if (CONFIG_T::reuse_factor > 38) pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases); - if (CONFIG_T::reuse_factor > 39) pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases); - if (CONFIG_T::reuse_factor > 40) pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases); - if (CONFIG_T::reuse_factor > 41) pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases); - if (CONFIG_T::reuse_factor > 42) 
pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
-    if (CONFIG_T::reuse_factor > 43) pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
-    if (CONFIG_T::reuse_factor > 44) pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
-    if (CONFIG_T::reuse_factor > 45) pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 46) pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
-    if (CONFIG_T::reuse_factor > 47) pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
-    if (CONFIG_T::reuse_factor > 48) pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
-    if (CONFIG_T::reuse_factor > 49) pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
-    if (CONFIG_T::reuse_factor > 50) pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
-    if (CONFIG_T::reuse_factor > 51) pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
-    if (CONFIG_T::reuse_factor > 52) pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
-    if (CONFIG_T::reuse_factor > 53) pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
-    if (CONFIG_T::reuse_factor > 54) pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
-    if (CONFIG_T::reuse_factor > 55) pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 56) pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
-    if (CONFIG_T::reuse_factor > 57) pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
-    if (CONFIG_T::reuse_factor > 58) pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
-    if (CONFIG_T::reuse_factor > 59) pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
-    if (CONFIG_T::reuse_factor > 60) pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
-    if (CONFIG_T::reuse_factor > 61) pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
-    if (CONFIG_T::reuse_factor > 62) pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
-    if (CONFIG_T::reuse_factor > 63) pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
-    if (CONFIG_T::reuse_factor > 64) pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
-    if (CONFIG_T::reuse_factor > 65) pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
-    if (CONFIG_T::reuse_factor > 66) pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
-    if (CONFIG_T::reuse_factor > 67) pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
-    if (CONFIG_T::reuse_factor > 68) pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
-    if (CONFIG_T::reuse_factor > 69) pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
-    if (CONFIG_T::reuse_factor > 70) pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
-    if (CONFIG_T::reuse_factor > 71) pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
-    if (CONFIG_T::reuse_factor > 72) pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
-    if (CONFIG_T::reuse_factor > 73) pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases);
-    if (CONFIG_T::reuse_factor > 74) pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, 
biases); - if (CONFIG_T::reuse_factor > 75) pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); - if (CONFIG_T::reuse_factor > 76) pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); - if (CONFIG_T::reuse_factor > 77) pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); - if (CONFIG_T::reuse_factor > 78) pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); - if (CONFIG_T::reuse_factor > 79) pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); - if (CONFIG_T::reuse_factor > 80) pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); - if (CONFIG_T::reuse_factor > 81) pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); - if (CONFIG_T::reuse_factor > 82) pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); - if (CONFIG_T::reuse_factor > 83) pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); - if (CONFIG_T::reuse_factor > 84) pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); - if (CONFIG_T::reuse_factor > 85) pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); - if (CONFIG_T::reuse_factor > 86) pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); - if (CONFIG_T::reuse_factor > 87) pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); - if (CONFIG_T::reuse_factor > 88) pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); - if (CONFIG_T::reuse_factor > 89) pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); - if (CONFIG_T::reuse_factor > 90) pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); - if (CONFIG_T::reuse_factor > 91) pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); - if (CONFIG_T::reuse_factor > 92) pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); - if (CONFIG_T::reuse_factor > 93) pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); - if (CONFIG_T::reuse_factor > 94) pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); - if (CONFIG_T::reuse_factor > 95) pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); - if (CONFIG_T::reuse_factor > 96) pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); - if (CONFIG_T::reuse_factor > 97) pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); - if (CONFIG_T::reuse_factor > 98) pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); - if (CONFIG_T::reuse_factor > 99) pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); - if (CONFIG_T::reuse_factor > 100) pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); - if (CONFIG_T::reuse_factor > 101) pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); - if (CONFIG_T::reuse_factor > 102) pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); - if (CONFIG_T::reuse_factor > 103) pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); - if (CONFIG_T::reuse_factor > 104) pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); - if (CONFIG_T::reuse_factor > 105) pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); - if (CONFIG_T::reuse_factor > 106) pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); - if (CONFIG_T::reuse_factor > 107) 
pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); - if (CONFIG_T::reuse_factor > 108) pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); - if (CONFIG_T::reuse_factor > 109) pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); - if (CONFIG_T::reuse_factor > 110) pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); - if (CONFIG_T::reuse_factor > 111) pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); - if (CONFIG_T::reuse_factor > 112) pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); - if (CONFIG_T::reuse_factor > 113) pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); - if (CONFIG_T::reuse_factor > 114) pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); - if (CONFIG_T::reuse_factor > 115) pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); - if (CONFIG_T::reuse_factor > 116) pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); - if (CONFIG_T::reuse_factor > 117) pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); - if (CONFIG_T::reuse_factor > 118) pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); - if (CONFIG_T::reuse_factor > 119) pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); - - RFOutputLoop: - for(int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerOutputLoop: - for(int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor; ii++) { + if (CONFIG_T::reuse_factor > 2) + pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); + if (CONFIG_T::reuse_factor > 3) + pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); + if (CONFIG_T::reuse_factor > 4) + pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); + if (CONFIG_T::reuse_factor > 5) + pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); + if (CONFIG_T::reuse_factor > 6) + pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); + if (CONFIG_T::reuse_factor > 7) + pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); + if (CONFIG_T::reuse_factor > 8) + pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); + if (CONFIG_T::reuse_factor > 9) + pointwise_conv_1d_latency_cl(data_tmp[9], res_tmp[9], weights, biases); + if (CONFIG_T::reuse_factor > 10) + pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); + if (CONFIG_T::reuse_factor > 11) + pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); + if (CONFIG_T::reuse_factor > 12) + pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); + if (CONFIG_T::reuse_factor > 13) + pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); + if (CONFIG_T::reuse_factor > 14) + pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); + if (CONFIG_T::reuse_factor > 15) + pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); + if (CONFIG_T::reuse_factor > 16) + pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); + if (CONFIG_T::reuse_factor > 17) + pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); + if (CONFIG_T::reuse_factor > 18) + pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); + if (CONFIG_T::reuse_factor > 19) + 
pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases);
+    if (CONFIG_T::reuse_factor > 20)
+        pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases);
+    if (CONFIG_T::reuse_factor > 21)
+        pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases);
+    if (CONFIG_T::reuse_factor > 22)
+        pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases);
+    if (CONFIG_T::reuse_factor > 23)
+        pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases);
+    if (CONFIG_T::reuse_factor > 24)
+        pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases);
+    if (CONFIG_T::reuse_factor > 25)
+        pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases);
+    if (CONFIG_T::reuse_factor > 26)
+        pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases);
+    if (CONFIG_T::reuse_factor > 27)
+        pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases);
+    if (CONFIG_T::reuse_factor > 28)
+        pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases);
+    if (CONFIG_T::reuse_factor > 29)
+        pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases);
+    if (CONFIG_T::reuse_factor > 30)
+        pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases);
+    if (CONFIG_T::reuse_factor > 31)
+        pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases);
+    if (CONFIG_T::reuse_factor > 32)
+        pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases);
+    if (CONFIG_T::reuse_factor > 33)
+        pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases);
+    if (CONFIG_T::reuse_factor > 34)
+        pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases);
+    if (CONFIG_T::reuse_factor > 35)
+        pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36)
+        pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37)
+        pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38)
+        pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39)
+        pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40)
+        pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41)
+        pointwise_conv_1d_latency_cl(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42)
+        pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43)
+        pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44)
+        pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45)
+        pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46)
+        pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47)
+        pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48)
+        pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49)
+        pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50)
+        pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51)
+        pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52)
+        pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53)
+        pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54)
+        pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55)
+        pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56)
+        pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57)
+        pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58)
+        pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59)
+        pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60)
+        pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61)
+        pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62)
+        pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63)
+        pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64)
+        pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65)
+        pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66)
+        pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67)
+        pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
+        pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
+    if (CONFIG_T::reuse_factor > 69)
+        pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
+    if (CONFIG_T::reuse_factor > 70)
+        pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
+    if (CONFIG_T::reuse_factor > 71)
+        pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
+    if (CONFIG_T::reuse_factor > 72)
+        pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
+    if (CONFIG_T::reuse_factor > 73)
+        pointwise_conv_1d_latency_cl(data_tmp[73], res_tmp[73], weights, biases);
+    if (CONFIG_T::reuse_factor > 74)
+        pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases);
+    if (CONFIG_T::reuse_factor > 75)
+        pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases);
+    if (CONFIG_T::reuse_factor > 76)
+        pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases);
+    if (CONFIG_T::reuse_factor > 77)
+        pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases);
+    if (CONFIG_T::reuse_factor > 78)
+        pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases);
+    if (CONFIG_T::reuse_factor > 79)
+        pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases);
+    if (CONFIG_T::reuse_factor > 80)
+        pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases);
+    if (CONFIG_T::reuse_factor > 81)
+        pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases);
+    if (CONFIG_T::reuse_factor > 82)
+        pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases);
+    if (CONFIG_T::reuse_factor > 83)
+        
pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); + if (CONFIG_T::reuse_factor > 84) + pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); + if (CONFIG_T::reuse_factor > 85) + pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); + if (CONFIG_T::reuse_factor > 86) + pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); + if (CONFIG_T::reuse_factor > 87) + pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); + if (CONFIG_T::reuse_factor > 88) + pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); + if (CONFIG_T::reuse_factor > 89) + pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); + if (CONFIG_T::reuse_factor > 90) + pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); + if (CONFIG_T::reuse_factor > 91) + pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); + if (CONFIG_T::reuse_factor > 92) + pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); + if (CONFIG_T::reuse_factor > 93) + pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); + if (CONFIG_T::reuse_factor > 94) + pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); + if (CONFIG_T::reuse_factor > 95) + pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); + if (CONFIG_T::reuse_factor > 96) + pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); + if (CONFIG_T::reuse_factor > 97) + pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); + if (CONFIG_T::reuse_factor > 98) + pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); + if (CONFIG_T::reuse_factor > 99) + pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); + if (CONFIG_T::reuse_factor > 100) + pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); + if (CONFIG_T::reuse_factor > 101) + pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); + if (CONFIG_T::reuse_factor > 102) + pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); + if (CONFIG_T::reuse_factor > 103) + pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) + pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) + pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) + pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) + pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) + pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) + pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) + pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) + pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) + pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) + pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) + pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, 
biases); + if (CONFIG_T::reuse_factor > 115) + pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) + pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) + pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) + pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) + pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + +RFOutputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerOutputLoop: + for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { #pragma HLS UNROLL - res[jj*CONFIG_T::out_width * CONFIG_T::n_filt/CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; } } } From 69aecc6dc187a6e9a1ecdd2e7449629f1a88e87b Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:27:20 -0700 Subject: [PATCH 08/41] add test --- hls4ml/backends/vivado/vivado_backend.py | 1 - test/pytest/test_pointwiseconv.py | 37 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 1eb58f0952..1a99d90a8e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -71,7 +71,6 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) attrs.append( ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') ) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 28314fe130..080106955e 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,20 +21,22 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy', + 'backend, io_type, strategy, conv_implementation', [ - ('Quartus', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'resource'), - ('Vitis', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'latency'), - ('Vitis', 'io_parallel', 'latency'), - ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource'), - ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), + ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), + ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), + ('Vitis', 'io_parallel', 'resource', 'LineBuffer'), + ('Vivado', 'io_parallel', 'latency', 'LineBuffer'), + ('Vitis', 'io_parallel', 'latency', 'LineBuffer'), + ('Vivado', 'io_parallel', 'latency', 'Pointwise'), + ('Vitis', 'io_parallel', 'latency', 'Pointwise'), + ('Vivado', 'io_stream', 'latency', 'LineBuffer'), + ('Vivado', 'io_stream', 'resource', 'LineBuffer'), + ('Vitis', 'io_stream', 'latency', 'LineBuffer'), + ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -47,6 +49,7 
@@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise1d' ) ) model.compile(optimizer='adam', loss='mse') @@ -55,14 +58,13 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): keras_prediction = model.predict(X_input) default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) + config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation output_dir = str( test_root_path - / 'hls4mlprj_pointwise1d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, strides[0], padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend @@ -100,6 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise2d' ) ) @@ -114,9 +117,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_pointwise2d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, stride_cfg, padds, backend, io_type, strategy - ) + / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 4febceded10000b3b1b6b4254c9b9c230a9f475c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:48:44 -0700 Subject: [PATCH 09/41] pre-commit --- test/pytest/test_pointwiseconv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 080106955e..0cb75b7a87 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -49,7 +49,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise1d' + name='pointwise1d', ) ) model.compile(optimizer='adam', loss='mse') @@ -102,7 +102,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, - name='pointwise2d' + name='pointwise2d', ) ) @@ -116,8 +116,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): config['Model']['Strategy'] = strategy stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( - test_root_path - / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' + test_root_path / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( From 56797e73ecb1a830c28128387536308fd3f50beb Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 14 Jun 2023 17:53:37 -0700 Subject: [PATCH 10/41] pre-commit --- test/pytest/test_pointwiseconv.py | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 0cb75b7a87..cbe2036712 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_implementation', + 'backend, io_type, strategy, conv_impl', [ ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), @@ -36,7 +36,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_implementation): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -60,11 +60,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv default_precision = 'ac_fixed<32,16,true>' if backend == 'Quartus' else 'ap_fixed<32,16>' config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy - config['LayerName']['pointwise1d']['ConvImplementation'] = conv_implementation + config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_implementation}' + / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From a01080dc210ef23640b766f0b9a24090ac540f58 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 7 Oct 2023 22:09:14 -0700 Subject: [PATCH 11/41] use code gen --- hls4ml/backends/fpga/fpga_backend.py | 57 +++ hls4ml/backends/fpga/passes/codegen.py | 22 ++ .../vivado/passes/convolution_templates.py | 7 + .../vivado/nnet_utils/nnet_code_gen.h | 90 +++++ .../templates/vivado/nnet_utils/nnet_conv1d.h | 4 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 351 ------------------ test/pytest/test_pointwiseconv.py | 7 +- 7 files changed, 183 insertions(+), 355 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8cfaec8b3f..349a5ddbc8 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -860,6 +860,63 @@ def generate_conv2d_line_buffer_fn( return generated_code + def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. 
+ + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + "RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];" + "\n" + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + for i in range(reuse_factor): + generated_code += ( + f" pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + "RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" + " }\n" + " }\n" + "}\n" + "};\n" + ) + + return generated_code + @model_optimizer() def write_hls(self, model): self.writer.write_hls(model) diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index f1f1080996..6d7243dd8b 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -49,3 +49,25 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' + + def transform(self, model, node): + node_class = node.__class__.__name__ + if '1D' in node_class: + self._generate_pointwise_conv1d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_pointwise_conv1d(self, node): + code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 874349aab3..a4fbdd405f 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -56,6 +56,8 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" @@ -89,6 +91,11 @@ def format(self, node): else: params['fill_fn'] = 
'FillConv1DBuffer' + if node.get_attr('filt_width') == 1 and node.model.config.get_config_value('IOType') == 'io_parallel': + params['pointwise_fn'] = f'pointwise_conv_{node.index}' + else: + params['pointwise_fn'] = 'PointwiseConv1D' + conv_config = self.template.format(**params) mult_params = self._default_config_params(node) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index e4db43682e..32fa7321c5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -25,6 +25,96 @@ template class FillConv2DBuffer { } }; +template class PointwiseConv1D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / 
CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 0f2e89ac8f..7cceabfe1b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -56,8 +56,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) { - pointwise_conv_1d_latency_cl_split_by_rf(data, res, weights, biases); + if (CONFIG_T::reuse_factor > 1) { + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { assert(CONFIG_T::reuse_factor == 1); pointwise_conv_1d_latency_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index aabc869823..0d9afb10cb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,356 +84,5 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } -template -void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); -#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - -// Convolve, saving all multiplication results to accumulate later -ConvOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - ConvFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: - for (int 
cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - int index_weight = cc * CONFIG_T::n_filt + ff; - int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || - (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - mult[index_mult] = data[index_data] * weights[index_weight]; - } - } // end channel loop - } // end filter loop - } // end output loop - - // Initialize accumulator with input biases - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - acc[ii][ff] = biases[ff]; - } - } - -// Accumulate multiplication result -AccumOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - AccumFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Do "dot product" sum within filter and sum over channels - AccumChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - acc[ii][ff] += mult[index_mult]; - } // end channel loop - } // end filter loop - } // end output loop - - // Cast to "res_t" type - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); - } - } -} - -template -void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - - data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0 - res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor]; - #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0 - -RFInputLoop: - for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerInputLoop: - for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) { - #pragma HLS UNROLL - data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii]; - } - } - - pointwise_conv_1d_latency_cl(data_tmp[0], res_tmp[0], weights, biases); - pointwise_conv_1d_latency_cl(data_tmp[1], res_tmp[1], weights, biases); - if (CONFIG_T::reuse_factor > 2) - pointwise_conv_1d_latency_cl(data_tmp[2], res_tmp[2], weights, biases); - if (CONFIG_T::reuse_factor > 3) - pointwise_conv_1d_latency_cl(data_tmp[3], res_tmp[3], weights, biases); - if (CONFIG_T::reuse_factor > 4) - pointwise_conv_1d_latency_cl(data_tmp[4], res_tmp[4], weights, biases); - if (CONFIG_T::reuse_factor > 5) - pointwise_conv_1d_latency_cl(data_tmp[5], res_tmp[5], weights, biases); - if (CONFIG_T::reuse_factor > 6) - pointwise_conv_1d_latency_cl(data_tmp[6], res_tmp[6], weights, biases); - if (CONFIG_T::reuse_factor > 7) - pointwise_conv_1d_latency_cl(data_tmp[7], res_tmp[7], weights, biases); - if (CONFIG_T::reuse_factor > 8) - pointwise_conv_1d_latency_cl(data_tmp[8], res_tmp[8], weights, biases); - if (CONFIG_T::reuse_factor > 9) - pointwise_conv_1d_latency_cl(data_tmp[9], 
res_tmp[9], weights, biases); - if (CONFIG_T::reuse_factor > 10) - pointwise_conv_1d_latency_cl(data_tmp[10], res_tmp[10], weights, biases); - if (CONFIG_T::reuse_factor > 11) - pointwise_conv_1d_latency_cl(data_tmp[11], res_tmp[11], weights, biases); - if (CONFIG_T::reuse_factor > 12) - pointwise_conv_1d_latency_cl(data_tmp[12], res_tmp[12], weights, biases); - if (CONFIG_T::reuse_factor > 13) - pointwise_conv_1d_latency_cl(data_tmp[13], res_tmp[13], weights, biases); - if (CONFIG_T::reuse_factor > 14) - pointwise_conv_1d_latency_cl(data_tmp[14], res_tmp[14], weights, biases); - if (CONFIG_T::reuse_factor > 15) - pointwise_conv_1d_latency_cl(data_tmp[15], res_tmp[15], weights, biases); - if (CONFIG_T::reuse_factor > 16) - pointwise_conv_1d_latency_cl(data_tmp[16], res_tmp[16], weights, biases); - if (CONFIG_T::reuse_factor > 17) - pointwise_conv_1d_latency_cl(data_tmp[17], res_tmp[17], weights, biases); - if (CONFIG_T::reuse_factor > 18) - pointwise_conv_1d_latency_cl(data_tmp[18], res_tmp[18], weights, biases); - if (CONFIG_T::reuse_factor > 19) - pointwise_conv_1d_latency_cl(data_tmp[19], res_tmp[19], weights, biases); - if (CONFIG_T::reuse_factor > 20) - pointwise_conv_1d_latency_cl(data_tmp[20], res_tmp[20], weights, biases); - if (CONFIG_T::reuse_factor > 21) - pointwise_conv_1d_latency_cl(data_tmp[21], res_tmp[21], weights, biases); - if (CONFIG_T::reuse_factor > 22) - pointwise_conv_1d_latency_cl(data_tmp[22], res_tmp[22], weights, biases); - if (CONFIG_T::reuse_factor > 23) - pointwise_conv_1d_latency_cl(data_tmp[23], res_tmp[23], weights, biases); - if (CONFIG_T::reuse_factor > 24) - pointwise_conv_1d_latency_cl(data_tmp[24], res_tmp[24], weights, biases); - if (CONFIG_T::reuse_factor > 25) - pointwise_conv_1d_latency_cl(data_tmp[25], res_tmp[25], weights, biases); - if (CONFIG_T::reuse_factor > 26) - pointwise_conv_1d_latency_cl(data_tmp[26], res_tmp[26], weights, biases); - if (CONFIG_T::reuse_factor > 27) - pointwise_conv_1d_latency_cl(data_tmp[27], res_tmp[27], weights, biases); - if (CONFIG_T::reuse_factor > 28) - pointwise_conv_1d_latency_cl(data_tmp[28], res_tmp[28], weights, biases); - if (CONFIG_T::reuse_factor > 29) - pointwise_conv_1d_latency_cl(data_tmp[29], res_tmp[29], weights, biases); - if (CONFIG_T::reuse_factor > 30) - pointwise_conv_1d_latency_cl(data_tmp[30], res_tmp[30], weights, biases); - if (CONFIG_T::reuse_factor > 31) - pointwise_conv_1d_latency_cl(data_tmp[31], res_tmp[31], weights, biases); - if (CONFIG_T::reuse_factor > 32) - pointwise_conv_1d_latency_cl(data_tmp[32], res_tmp[32], weights, biases); - if (CONFIG_T::reuse_factor > 33) - pointwise_conv_1d_latency_cl(data_tmp[33], res_tmp[33], weights, biases); - if (CONFIG_T::reuse_factor > 34) - pointwise_conv_1d_latency_cl(data_tmp[34], res_tmp[34], weights, biases); - if (CONFIG_T::reuse_factor > 35) - pointwise_conv_1d_latency_cl(data_tmp[35], res_tmp[35], weights, biases); - if (CONFIG_T::reuse_factor > 36) - pointwise_conv_1d_latency_cl(data_tmp[36], res_tmp[36], weights, biases); - if (CONFIG_T::reuse_factor > 37) - pointwise_conv_1d_latency_cl(data_tmp[37], res_tmp[37], weights, biases); - if (CONFIG_T::reuse_factor > 38) - pointwise_conv_1d_latency_cl(data_tmp[38], res_tmp[38], weights, biases); - if (CONFIG_T::reuse_factor > 39) - pointwise_conv_1d_latency_cl(data_tmp[39], res_tmp[39], weights, biases); - if (CONFIG_T::reuse_factor > 40) - pointwise_conv_1d_latency_cl(data_tmp[40], res_tmp[40], weights, biases); - if (CONFIG_T::reuse_factor > 41) - pointwise_conv_1d_latency_cl(data_tmp[41], 
res_tmp[41], weights, biases);
-    if (CONFIG_T::reuse_factor > 42)
-        pointwise_conv_1d_latency_cl(data_tmp[42], res_tmp[42], weights, biases);
-    if (CONFIG_T::reuse_factor > 43)
-        pointwise_conv_1d_latency_cl(data_tmp[43], res_tmp[43], weights, biases);
-    if (CONFIG_T::reuse_factor > 44)
-        pointwise_conv_1d_latency_cl(data_tmp[44], res_tmp[44], weights, biases);
-    if (CONFIG_T::reuse_factor > 45)
-        pointwise_conv_1d_latency_cl(data_tmp[45], res_tmp[45], weights, biases);
-    if (CONFIG_T::reuse_factor > 46)
-        pointwise_conv_1d_latency_cl(data_tmp[46], res_tmp[46], weights, biases);
-    if (CONFIG_T::reuse_factor > 47)
-        pointwise_conv_1d_latency_cl(data_tmp[47], res_tmp[47], weights, biases);
-    if (CONFIG_T::reuse_factor > 48)
-        pointwise_conv_1d_latency_cl(data_tmp[48], res_tmp[48], weights, biases);
-    if (CONFIG_T::reuse_factor > 49)
-        pointwise_conv_1d_latency_cl(data_tmp[49], res_tmp[49], weights, biases);
-    if (CONFIG_T::reuse_factor > 50)
-        pointwise_conv_1d_latency_cl(data_tmp[50], res_tmp[50], weights, biases);
-    if (CONFIG_T::reuse_factor > 51)
-        pointwise_conv_1d_latency_cl(data_tmp[51], res_tmp[51], weights, biases);
-    if (CONFIG_T::reuse_factor > 52)
-        pointwise_conv_1d_latency_cl(data_tmp[52], res_tmp[52], weights, biases);
-    if (CONFIG_T::reuse_factor > 53)
-        pointwise_conv_1d_latency_cl(data_tmp[53], res_tmp[53], weights, biases);
-    if (CONFIG_T::reuse_factor > 54)
-        pointwise_conv_1d_latency_cl(data_tmp[54], res_tmp[54], weights, biases);
-    if (CONFIG_T::reuse_factor > 55)
-        pointwise_conv_1d_latency_cl(data_tmp[55], res_tmp[55], weights, biases);
-    if (CONFIG_T::reuse_factor > 56)
-        pointwise_conv_1d_latency_cl(data_tmp[56], res_tmp[56], weights, biases);
-    if (CONFIG_T::reuse_factor > 57)
-        pointwise_conv_1d_latency_cl(data_tmp[57], res_tmp[57], weights, biases);
-    if (CONFIG_T::reuse_factor > 58)
-        pointwise_conv_1d_latency_cl(data_tmp[58], res_tmp[58], weights, biases);
-    if (CONFIG_T::reuse_factor > 59)
-        pointwise_conv_1d_latency_cl(data_tmp[59], res_tmp[59], weights, biases);
-    if (CONFIG_T::reuse_factor > 60)
-        pointwise_conv_1d_latency_cl(data_tmp[60], res_tmp[60], weights, biases);
-    if (CONFIG_T::reuse_factor > 61)
-        pointwise_conv_1d_latency_cl(data_tmp[61], res_tmp[61], weights, biases);
-    if (CONFIG_T::reuse_factor > 62)
-        pointwise_conv_1d_latency_cl(data_tmp[62], res_tmp[62], weights, biases);
-    if (CONFIG_T::reuse_factor > 63)
-        pointwise_conv_1d_latency_cl(data_tmp[63], res_tmp[63], weights, biases);
-    if (CONFIG_T::reuse_factor > 64)
-        pointwise_conv_1d_latency_cl(data_tmp[64], res_tmp[64], weights, biases);
-    if (CONFIG_T::reuse_factor > 65)
-        pointwise_conv_1d_latency_cl(data_tmp[65], res_tmp[65], weights, biases);
-    if (CONFIG_T::reuse_factor > 66)
-        pointwise_conv_1d_latency_cl(data_tmp[66], res_tmp[66], weights, biases);
-    if (CONFIG_T::reuse_factor > 67)
-        pointwise_conv_1d_latency_cl(data_tmp[67], res_tmp[67], weights, biases);
-    if (CONFIG_T::reuse_factor > 68)
-        pointwise_conv_1d_latency_cl(data_tmp[68], res_tmp[68], weights, biases);
-    if (CONFIG_T::reuse_factor > 69)
-        pointwise_conv_1d_latency_cl(data_tmp[69], res_tmp[69], weights, biases);
-    if (CONFIG_T::reuse_factor > 70)
-        pointwise_conv_1d_latency_cl(data_tmp[70], res_tmp[70], weights, biases);
-    if (CONFIG_T::reuse_factor > 71)
-        pointwise_conv_1d_latency_cl(data_tmp[71], res_tmp[71], weights, biases);
-    if (CONFIG_T::reuse_factor > 72)
-        pointwise_conv_1d_latency_cl(data_tmp[72], res_tmp[72], weights, biases);
-    if (CONFIG_T::reuse_factor > 73)
-        pointwise_conv_1d_latency_cl(data_tmp[73],
res_tmp[73], weights, biases); - if (CONFIG_T::reuse_factor > 74) - pointwise_conv_1d_latency_cl(data_tmp[74], res_tmp[74], weights, biases); - if (CONFIG_T::reuse_factor > 75) - pointwise_conv_1d_latency_cl(data_tmp[75], res_tmp[75], weights, biases); - if (CONFIG_T::reuse_factor > 76) - pointwise_conv_1d_latency_cl(data_tmp[76], res_tmp[76], weights, biases); - if (CONFIG_T::reuse_factor > 77) - pointwise_conv_1d_latency_cl(data_tmp[77], res_tmp[77], weights, biases); - if (CONFIG_T::reuse_factor > 78) - pointwise_conv_1d_latency_cl(data_tmp[78], res_tmp[78], weights, biases); - if (CONFIG_T::reuse_factor > 79) - pointwise_conv_1d_latency_cl(data_tmp[79], res_tmp[79], weights, biases); - if (CONFIG_T::reuse_factor > 80) - pointwise_conv_1d_latency_cl(data_tmp[80], res_tmp[80], weights, biases); - if (CONFIG_T::reuse_factor > 81) - pointwise_conv_1d_latency_cl(data_tmp[81], res_tmp[81], weights, biases); - if (CONFIG_T::reuse_factor > 82) - pointwise_conv_1d_latency_cl(data_tmp[82], res_tmp[82], weights, biases); - if (CONFIG_T::reuse_factor > 83) - pointwise_conv_1d_latency_cl(data_tmp[83], res_tmp[83], weights, biases); - if (CONFIG_T::reuse_factor > 84) - pointwise_conv_1d_latency_cl(data_tmp[84], res_tmp[84], weights, biases); - if (CONFIG_T::reuse_factor > 85) - pointwise_conv_1d_latency_cl(data_tmp[85], res_tmp[85], weights, biases); - if (CONFIG_T::reuse_factor > 86) - pointwise_conv_1d_latency_cl(data_tmp[86], res_tmp[86], weights, biases); - if (CONFIG_T::reuse_factor > 87) - pointwise_conv_1d_latency_cl(data_tmp[87], res_tmp[87], weights, biases); - if (CONFIG_T::reuse_factor > 88) - pointwise_conv_1d_latency_cl(data_tmp[88], res_tmp[88], weights, biases); - if (CONFIG_T::reuse_factor > 89) - pointwise_conv_1d_latency_cl(data_tmp[89], res_tmp[89], weights, biases); - if (CONFIG_T::reuse_factor > 90) - pointwise_conv_1d_latency_cl(data_tmp[90], res_tmp[90], weights, biases); - if (CONFIG_T::reuse_factor > 91) - pointwise_conv_1d_latency_cl(data_tmp[91], res_tmp[91], weights, biases); - if (CONFIG_T::reuse_factor > 92) - pointwise_conv_1d_latency_cl(data_tmp[92], res_tmp[92], weights, biases); - if (CONFIG_T::reuse_factor > 93) - pointwise_conv_1d_latency_cl(data_tmp[93], res_tmp[93], weights, biases); - if (CONFIG_T::reuse_factor > 94) - pointwise_conv_1d_latency_cl(data_tmp[94], res_tmp[94], weights, biases); - if (CONFIG_T::reuse_factor > 95) - pointwise_conv_1d_latency_cl(data_tmp[95], res_tmp[95], weights, biases); - if (CONFIG_T::reuse_factor > 96) - pointwise_conv_1d_latency_cl(data_tmp[96], res_tmp[96], weights, biases); - if (CONFIG_T::reuse_factor > 97) - pointwise_conv_1d_latency_cl(data_tmp[97], res_tmp[97], weights, biases); - if (CONFIG_T::reuse_factor > 98) - pointwise_conv_1d_latency_cl(data_tmp[98], res_tmp[98], weights, biases); - if (CONFIG_T::reuse_factor > 99) - pointwise_conv_1d_latency_cl(data_tmp[99], res_tmp[99], weights, biases); - if (CONFIG_T::reuse_factor > 100) - pointwise_conv_1d_latency_cl(data_tmp[100], res_tmp[100], weights, biases); - if (CONFIG_T::reuse_factor > 101) - pointwise_conv_1d_latency_cl(data_tmp[101], res_tmp[101], weights, biases); - if (CONFIG_T::reuse_factor > 102) - pointwise_conv_1d_latency_cl(data_tmp[102], res_tmp[102], weights, biases); - if (CONFIG_T::reuse_factor > 103) - pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); - if (CONFIG_T::reuse_factor > 104) - pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); - if (CONFIG_T::reuse_factor > 105) - 
pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); - if (CONFIG_T::reuse_factor > 106) - pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); - if (CONFIG_T::reuse_factor > 107) - pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); - if (CONFIG_T::reuse_factor > 108) - pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); - if (CONFIG_T::reuse_factor > 109) - pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); - if (CONFIG_T::reuse_factor > 110) - pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); - if (CONFIG_T::reuse_factor > 111) - pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); - if (CONFIG_T::reuse_factor > 112) - pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); - if (CONFIG_T::reuse_factor > 113) - pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); - if (CONFIG_T::reuse_factor > 114) - pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); - if (CONFIG_T::reuse_factor > 115) - pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); - if (CONFIG_T::reuse_factor > 116) - pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); - if (CONFIG_T::reuse_factor > 117) - pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); - if (CONFIG_T::reuse_factor > 118) - pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); - if (CONFIG_T::reuse_factor > 119) - pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); - -RFOutputLoop: - for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { - #pragma HLS UNROLL - InnerOutputLoop: - for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { - #pragma HLS UNROLL - res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; - } - } -} - } // namespace nnet #endif diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index cbe2036712..a7ad3437b2 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -15,11 +15,13 @@ strides1d_options = [(1,), (2,)] strides2d_options = [(1, 1), (2, 2)] strategy_options = ['Latency', 'Resource'] +rf_options = [1, 2] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) +@pytest.mark.parametrize('rf', rf_options) @pytest.mark.parametrize( 'backend, io_type, strategy, conv_impl', [ @@ -36,7 +38,7 @@ ('Vitis', 'io_stream', 'resource', 'LineBuffer'), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl): +def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, conv_impl): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -61,10 +63,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl + config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_strides_{strides[0]}_{padds}_padding_{backend}_{io_type}_{strategy}_{conv_impl}' + / 
f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{rf}_{backend}_{io_type}_{strategy}_{conv_impl}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From 30c5c70f649553ab11611f6b02f8ab84bd86e801 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 8 Oct 2023 05:40:04 -0700 Subject: [PATCH 12/41] fix indent --- hls4ml/backends/fpga/fpga_backend.py | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 349a5ddbc8..35151af348 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -874,44 +874,45 @@ def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): generated_code = ( "template\n" "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" + " public:\n" " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - "RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];" - "\n" - " }}\n" - " }}\n\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" ).format(index=layer_idx) + indent = " " for i in range(reuse_factor): + generated_code += indent generated_code += ( - f" pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" ) generated_code += ( "\n" - "RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / 
CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" " }\n" " }\n" - "}\n" "};\n" ) From a05bf69ebc99d7ce448db3f89398d615a52fe369 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 9 Oct 2023 13:28:57 -0700 Subject: [PATCH 13/41] update rf --- test/pytest/test_pointwiseconv.py | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index a7ad3437b2..79fce34103 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -15,30 +15,30 @@ strides1d_options = [(1,), (2,)] strides2d_options = [(1, 1), (2, 2)] strategy_options = ['Latency', 'Resource'] -rf_options = [1, 2] @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) -@pytest.mark.parametrize('rf', rf_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_impl', + 'backend, io_type, strategy, conv_impl, rf', [ - ('Quartus', 'io_parallel', 'resource', 'LineBuffer'), - ('Vivado', 'io_parallel', 'resource', 'LineBuffer'), - ('Vitis', 'io_parallel', 'resource', 'LineBuffer'), - ('Vivado', 'io_parallel', 'latency', 'LineBuffer'), - ('Vitis', 'io_parallel', 'latency', 'LineBuffer'), - ('Vivado', 'io_parallel', 'latency', 'Pointwise'), - ('Vitis', 'io_parallel', 'latency', 'Pointwise'), - ('Vivado', 'io_stream', 'latency', 'LineBuffer'), - ('Vivado', 'io_stream', 'resource', 'LineBuffer'), - ('Vitis', 'io_stream', 'latency', 'LineBuffer'), - ('Vitis', 'io_stream', 'resource', 'LineBuffer'), + ('Quartus', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vitis', 'io_parallel', 'resource', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'latency', 'LineBuffer', 1), + ('Vitis', 'io_parallel', 'latency', 'LineBuffer', 1), + ('Vivado', 'io_parallel', 'latency', 'Pointwise', 1), + ('Vivado', 'io_parallel', 'latency', 'Pointwise', 14), + ('Vitis', 'io_parallel', 'latency', 'Pointwise', 1), + ('Vitis', 'io_parallel', 'latency', 'Pointwise', 14), + ('Vivado', 'io_stream', 'latency', 'LineBuffer', 1), + ('Vivado', 'io_stream', 'resource', 'LineBuffer', 1), + ('Vitis', 'io_stream', 'latency', 'LineBuffer', 1), + ('Vitis', 'io_stream', 'resource', 'LineBuffer', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, conv_impl): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -67,7 +67,7 @@ def test_pointwiseconv1d(chans, padds, strides, rf, backend, io_type, strategy, output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{rf}_{backend}_{io_type}_{strategy}_{conv_impl}' + / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_{conv_impl}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, 
output_dir=output_dir, io_type=io_type, backend=backend From 445b2cd8744d3ba7928a69a1f556fe5c82c0e6d8 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 11 Oct 2023 19:58:42 -0700 Subject: [PATCH 14/41] address vlad comments part 1 --- hls4ml/backends/fpga/fpga_backend.py | 58 ------------- .../passes/{codegen.py => im2col_codegen.py} | 0 .../vivado/passes/pointwise_codegen.py | 25 ++++++ hls4ml/backends/vivado/vivado_backend.py | 58 +++++++++++++ hls4ml/templates/vivado/build_prj.tcl | 2 +- .../vivado/nnet_utils/nnet_code_gen.h | 81 +------------------ .../templates/vivado/nnet_utils/nnet_common.h | 1 + .../vivado/nnet_utils/nnet_conv1d_latency.h | 80 ++++++++++++++++++ hls4ml/writer/vivado_writer.py | 2 + 9 files changed, 168 insertions(+), 139 deletions(-) rename hls4ml/backends/fpga/passes/{codegen.py => im2col_codegen.py} (100%) create mode 100644 hls4ml/backends/vivado/passes/pointwise_codegen.py diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 35151af348..8cfaec8b3f 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -860,64 +860,6 @@ def generate_conv2d_line_buffer_fn( return generated_code - def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): - """Generate a C++ function for a pointwise convolution layer. - - Args: - layer_idx (int): Index of layer ('index' attribute). - reuse_factor (int): Number of partitions to divide the input into. - - Returns: - str: Generated C++ function - """ - - generated_code = ( - "template\n" - "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" - " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - " RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 - " }}\n" - " }}\n\n" - ).format(index=layer_idx) - indent = " " - for i in range(reuse_factor): - generated_code += indent - generated_code += ( - f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" - ) - - generated_code += ( - "\n" - " RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 - " }\n" - " }\n" - " }\n" - "};\n" - ) - - return generated_code - @model_optimizer() def write_hls(self, model): self.writer.write_hls(model) diff --git a/hls4ml/backends/fpga/passes/codegen.py 
b/hls4ml/backends/fpga/passes/im2col_codegen.py similarity index 100% rename from hls4ml/backends/fpga/passes/codegen.py rename to hls4ml/backends/fpga/passes/im2col_codegen.py diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py new file mode 100644 index 0000000000..f459d59208 --- /dev/null +++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py @@ -0,0 +1,25 @@ +from hls4ml.model.layers import Conv1D +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import Source + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' + + def transform(self, model, node): + node_class = node.__class__.__name__ + if '1D' in node_class: + self._generate_pointwise_conv1d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_pointwise_conv1d(self, node): + code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 011d576f64..8db278be9b 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -474,3 +474,61 @@ def init_garnet(self, layer): @layer_optimizer(GarNetStack) def init_garnet_stack(self, layer): self.init_garnet(layer) + + def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. 
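+
+        For example (illustrative values only), layer_idx=3 with reuse_factor=2
+        yields a class named pointwise_conv_3 whose pointwise_conv() splits the
+        input into two partitions and issues one pointwise_conv_1d_latency_cl
+        call per partition.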
+ + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + indent = " " + for i in range(reuse_factor): + generated_code += indent + generated_code += ( + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" + " }\n" + " }\n" + "};\n" + ) + + return generated_code diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 82b3c5a640..4ef8032d4f 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 8192} +catch {config_array_partition -maximum_size $maximum_size} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 32fa7321c5..1900aa2716 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -1,6 +1,7 @@ #ifndef NNET_INSTR_GEN_H_ #define NNET_INSTR_GEN_H_ +#include "nnet_conv1d_latency.h" #include "nnet_helpers.h" #include @@ -35,86 +36,6 @@ template class PointwiseConv1D { } }; -template -void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; - typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; - - #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 - #pragma HLS 
ARRAY_PARTITION variable=acc complete dim=0 - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // Parallel mode - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 - #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 - - // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); -#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit - -// Convolve, saving all multiplication results to accumulate later -ConvOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - ConvFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - ConvChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - #pragma HLS UNROLL - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - int index_weight = cc * CONFIG_T::n_filt + ff; - int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; - - if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || - (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { - mult[index_mult] = 0; - } else { - mult[index_mult] = data[index_data] * weights[index_weight]; - } - } // end channel loop - } // end filter loop - } // end output loop - - // Initialize accumulator with input biases - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - acc[ii][ff] = biases[ff]; - } - } - -// Accumulate multiplication result -AccumOut: - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - AccumFilt: - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Do "dot product" sum within filter and sum over channels - AccumChan: - for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { - int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; - acc[ii][ff] += mult[index_mult]; - } // end channel loop - } // end filter loop - } // end output loop - - // Cast to "res_t" type - for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - #pragma HLS UNROLL - res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); - } - } -} - // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index e942a1dc89..c3cf1a2de4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". 
#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..8fb9f769f4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,85 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL 
+ res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 412bb8d667..2f7bb676f4 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -591,6 +591,8 @@ def write_build_script(self, model): f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) + f.write('variable maximum_size\n') + f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4192'))) f.close() # build_prj.tcl From 1dd2603558f8ceb6d16b449c67e52567650d3eaf Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 11 Oct 2023 20:01:28 -0700 Subject: [PATCH 15/41] default 4096 --- hls4ml/writer/vivado_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 2f7bb676f4..80c4094a4f 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -592,7 +592,7 @@ def write_build_script(self, model): f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) f.write('variable maximum_size\n') - f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4192'))) + f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4096'))) f.close() # build_prj.tcl From 04997c234ffed74b35ff79074d5c8b9c7788477f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 14 Oct 2023 21:27:51 -0700 Subject: [PATCH 16/41] only add pointwise function when optimizing --- hls4ml/backends/fpga/passes/im2col_codegen.py | 22 ----- .../vivado/passes/convolution_templates.py | 2 - hls4ml/backends/vivado/passes/pointwise.py | 82 ++++++++++++++++++- .../vivado/nnet_utils/nnet_code_gen.h | 10 +++ 4 files changed, 88 insertions(+), 28 deletions(-) diff --git a/hls4ml/backends/fpga/passes/im2col_codegen.py b/hls4ml/backends/fpga/passes/im2col_codegen.py index 6d7243dd8b..f1f1080996 100644 --- a/hls4ml/backends/fpga/passes/im2col_codegen.py +++ b/hls4ml/backends/fpga/passes/im2col_codegen.py @@ -49,25 +49,3 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) - - -class GeneratePointwiseConv1D(OptimizerPass): - '''Generates code for pointwise 1D convolution''' - - def match(self, node): - return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' - - def transform(self, model, node): - node_class = node.__class__.__name__ - if '1D' in node_class: - self._generate_pointwise_conv1d(node) - else: - raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') - - def _generate_pointwise_conv1d(self, node): - code_str = node.model.config.backend.generate_pointwise_conv1d_fn( - node.get_attr('index'), - node.get_attr('reuse_factor'), - ) - - node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index a4fbdd405f..60eddae806 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -56,8 +56,6 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; - template - using 
pointwise_conv = nnet::{pointwise_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index c353a10604..0353787e8c 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ b/hls4ml/backends/vivado/passes/pointwise.py @@ -8,13 +8,87 @@ Conv1DFunctionTemplate, Conv2DConfigTemplate, Conv2DFunctionTemplate, - conv1d_config_template, - conv2d_config_template, conv_mult_config_template, ) from hls4ml.model.layers import register_layer from hls4ml.model.optimizer import OptimizerPass +pointwise_conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_width = {min_width}; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index = nnet::{scale_index_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; +}}; +const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" + +pointwise_conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_height = {min_height}; + static const unsigned min_width = {min_width}; + static const ap_uint pixels[min_height * min_width]; + static const unsigned n_partitions = {n_partitions}; + static const 
unsigned n_pixels = out_height * out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index_height = nnet::{scale_index_height_type}; + template + using scale_index_width = nnet::{scale_index_width_type}; + template + using pointwise_conv = nnet::{pointwise_fn}; +}}; +const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" + pointwise_conv1d_function_template = ( 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' ) @@ -29,7 +103,7 @@ class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): def __init__(self): super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) - self.template = conv1d_config_template + self.template = pointwise_conv1d_config_template self.mult_template = conv_mult_config_template @@ -42,7 +116,7 @@ def __init__(self): class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): def __init__(self): super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) - self.template = conv2d_config_template + self.template = pointwise_conv2d_config_template self.mult_template = conv_mult_config_template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 1900aa2716..1e922bbfed 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -36,6 +36,16 @@ template class PointwiseConv1D { } }; +template class PointwiseConv2D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + // hls4ml insert code } // namespace nnet From a181d971b38a09aa4bd0d62e303d43f08474ca0f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 9 Jun 2024 21:35:40 -0700 Subject: [PATCH 17/41] add vitis --- .../templates/vitis/nnet_utils/nnet_conv1d.h | 14 +++- .../vitis/nnet_utils/nnet_conv1d_latency.h | 80 +++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 52a404672c..1b66c646af 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -55,9 +55,19 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. 
//#pragma HLS INLINE recursive - // Nothing special to be done for io_parallel implementation if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); + if (CONFIG_T::implementation == conv_implementation::pointwise) { + // Use pointwise unrolled implementation + if (CONFIG_T::reuse_factor > 1) { + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); + } else { + assert(CONFIG_T::reuse_factor == 1); + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } + } else { + // Use standard unrolled implementation + conv_1d_latency_cl(data, res, weights, biases); + } } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 1bf25cc89c..3fd6160f4f 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -85,5 +85,85 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + int multiplier_limit = + ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / + float(CONFIG_T::reuse_factor)); +#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result 
+AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + } // namespace nnet #endif From a6a5c7f9a44848de88c59271f9d3298608c5bc4c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Wed, 9 Oct 2024 18:52:55 -0700 Subject: [PATCH 18/41] add flow --- hls4ml/backends/vivado/vivado_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 982fa2ce87..694cb503fe 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -115,6 +115,7 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', + 'vivado:generate_pointwise_conv1_d', ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) From 170999fae963dba6bf4091a8af60f16b17dfb96a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 10 Oct 2024 07:14:46 -0700 Subject: [PATCH 19/41] div roundup --- example-models | 2 +- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 4 +--- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/example-models b/example-models index 3cfbcfd062..ff74f73dbc 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 +Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 3fd6160f4f..bfe675ce12 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -107,9 +107,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 8fb9f769f4..6f23976799 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -106,9 +106,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete 
dim=0 // Limit multipliers to control parallelization - int multiplier_limit = - ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) / - float(CONFIG_T::reuse_factor)); + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later From 4ec63876dbbd8643f195164f375882e329f27859 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 17 Oct 2024 13:50:27 -0700 Subject: [PATCH 20/41] update --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 7 +------ hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 1b66c646af..1c268ed588 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -58,12 +58,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1) { - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - assert(CONFIG_T::reuse_factor == 1); - pointwise_conv_1d_latency_cl(data, res, weights, biases); - } + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 7cceabfe1b..95d5d7fcce 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -56,12 +56,7 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], if (CONFIG_T::strategy == nnet::latency) { if (CONFIG_T::implementation == conv_implementation::pointwise) { // Use pointwise unrolled implementation - if (CONFIG_T::reuse_factor > 1) { - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - assert(CONFIG_T::reuse_factor == 1); - pointwise_conv_1d_latency_cl(data, res, weights, biases); - } + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { // Use standard unrolled implementation conv_1d_latency_cl(data, res, weights, biases); From 6ca2f1b381072e86eb3b5949315ec8e5a0b2a92a Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 28 Oct 2024 20:25:38 -0700 Subject: [PATCH 21/41] roundup --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 3 ++- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index bfe675ce12..9102a038fd 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -107,7 +107,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - constexpr unsigned multiplier_limit 
= DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); + constexpr unsigned multiplier_limit = DIV_ROUNDUP( + CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 6f23976799..2692f2912c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -106,7 +106,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 // Limit multipliers to control parallelization - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); + constexpr unsigned multiplier_limit = DIV_ROUNDUP( + CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor); #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit // Convolve, saving all multiplication results to accumulate later From 352772d73386e915f94c756b761da9dc0154566f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Mon, 28 Oct 2024 20:30:11 -0700 Subject: [PATCH 22/41] restore example-models --- example-models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example-models b/example-models index ff74f73dbc..3cfbcfd062 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 +Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 From d37a843e77491416ff9d6cf9640cd13115324149 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:17:23 -0700 Subject: [PATCH 23/41] remove pointwise conv implementation option; make it default --- hls4ml/backends/vivado/passes/pointwise.py | 46 +------------------ hls4ml/backends/vivado/vivado_backend.py | 4 +- .../vivado/nnet_utils/nnet_code_gen.h | 10 ---- .../templates/vivado/nnet_utils/nnet_common.h | 2 - .../templates/vivado/nnet_utils/nnet_conv1d.h | 9 +--- .../vivado/nnet_utils/nnet_conv_stream.h | 4 +- test/pytest/test_pointwiseconv.py | 41 ++++++++--------- 7 files changed, 27 insertions(+), 89 deletions(-) diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index c671f1d670..79a72c1e6a 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ b/hls4ml/backends/vivado/passes/pointwise.py @@ -4,6 +4,7 @@ Conv1DFunctionTemplate, Conv2DConfigTemplate, Conv2DFunctionTemplate, + conv2d_config_template, conv_mult_config_template, ) from hls4ml.model.layers import register_layer @@ -42,49 +43,6 @@ }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" -pointwise_conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ - static const unsigned pad_top = {pad_top}; - static const unsigned pad_bottom = {pad_bottom}; - static const unsigned pad_left = {pad_left}; - static const unsigned pad_right = {pad_right}; - static const unsigned in_height = {in_height}; - static const unsigned in_width = {in_width}; - static const unsigned n_chan = {n_chan}; - static const unsigned filt_height = {filt_height}; - static const unsigned filt_width = 
{filt_width}; - static const unsigned kernel_size = filt_height * filt_width; - static const unsigned n_filt = {n_filt}; - static const unsigned stride_height = {stride_height}; - static const unsigned stride_width = {stride_width}; - static const unsigned out_height = {out_height}; - static const unsigned out_width = {out_width}; - static const unsigned reuse_factor = {reuse}; - static const unsigned n_zeros = {nzeros}; - static const unsigned multiplier_limit = - DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; - static const bool store_weights_in_bram = false; - static const unsigned strategy = nnet::{strategy}; - static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; - static const unsigned min_height = {min_height}; - static const unsigned min_width = {min_width}; - static const ap_uint pixels[min_height * min_width]; - static const unsigned n_partitions = {n_partitions}; - static const unsigned n_pixels = out_height * out_width / n_partitions; - template - using fill_buffer = nnet::{fill_fn}; - typedef {accum_t.name} accum_t; - typedef {bias_t.name} bias_t; - typedef {weight_t.name} weight_t; - typedef {config_t} mult_config; - template - using scale_index_height = nnet::{scale_index_height_type}; - template - using scale_index_width = nnet::{scale_index_width_type}; - template - using pointwise_conv = nnet::{pointwise_fn}; -}}; -const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" - pointwise_conv1d_function_template = ( 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' ) @@ -112,7 +70,7 @@ def __init__(self): class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): def __init__(self): super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) - self.template = pointwise_conv2d_config_template + self.template = conv2d_config_template self.mult_template = conv_mult_config_template diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index bea9b9ab35..8df4b86364 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -70,9 +70,7 @@ def _register_layer_attributes(self): cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append( - ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded', 'Pointwise'], default='LineBuffer') - ) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 11152338e6..6011e20cca 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -21,16 +21,6 @@ template class PointwiseConv1D { } }; -template class PointwiseConv2D { - public: - static void pointwise_conv(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - // To be implemented in subclasses - } -}; - // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h 
b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 8ce3d836fa..a14517df5b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,7 +2,6 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" -#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". #define DIV_ROUNDUP(n, d) ((n + d - 1) / d) @@ -25,7 +24,6 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; enum strategy { latency, resource, resource_unrolled }; -enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 }; /* --- * Balanced tree reduce implementation. diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index 95d5d7fcce..f0f1c133b9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -54,13 +54,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise) { - // Use pointwise unrolled implementation - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - // Use standard unrolled implementation - conv_1d_latency_cl(data, res, weights, biases); - } + // Use pointwise unrolled implementation + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { conv_1d_resource_cl(data, res, weights, biases); } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 4189fb3a09..0caa435717 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -8,6 +8,8 @@ namespace nnet { +enum class conv_implementation { linebuffer = 0, encoded = 1 }; + // ************************************************* // Encoded Implementation (Vlad's) // ************************************************* @@ -56,7 +58,7 @@ template unsigned scale_index_K_lt_S(const template class scale_index_regular { public: static unsigned scale_index(const unsigned idx) { - #pragma HLS INLINE +#pragma HLS INLINE if (K >= S) { return scale_index_K_gte_S(idx); diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 1bd9a73b01..2890b0ab11 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -19,29 +19,27 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy, conv_impl, rf', + 'backend, io_type, strategy, rf', [ - ('Quartus', 'io_parallel', 'resource', 'LineBuffer', 1), - ('Quartus', 'io_stream', 'resource', 'LineBuffer', 1), - ('oneAPI', 'io_parallel', 'resource', 'LineBuffer', 1), - ('oneAPI', 'io_stream', 'resource', 'LineBuffer', 1), - ('Vivado', 'io_parallel', 'resource', 'LineBuffer', 1), - ('Vitis', 'io_parallel', 'resource', 'LineBuffer', 1), - ('Vivado', 'io_parallel', 'latency', 'LineBuffer', 1), - ('Vitis', 'io_parallel', 'latency', 'LineBuffer', 1), - ('Vivado', 'io_parallel', 'latency', 'Pointwise', 1), - ('Vivado', 'io_parallel', 'latency', 'Pointwise', 14), - ('Vitis', 'io_parallel', 'latency', 'Pointwise', 1), - ('Vitis', 'io_parallel', 'latency', 'Pointwise', 14), - ('Vivado', 'io_stream', 'latency', 'LineBuffer', 1), - ('Vivado', 'io_stream', 'resource', 
'LineBuffer', 1), - ('Vitis', 'io_stream', 'latency', 'LineBuffer', 1), - ('Vitis', 'io_stream', 'resource', 'LineBuffer', 1), - ('Catapult', 'io_stream', 'latency', 'LineBuffer', 1), - ('Catapult', 'io_stream', 'resource', 'LineBuffer', 1), + ('Quartus', 'io_parallel', 'resource', 1), + ('Quartus', 'io_stream', 'resource', 1), + ('oneAPI', 'io_parallel', 'resource', 1), + ('oneAPI', 'io_stream', 'resource', 1), + ('Vivado', 'io_parallel', 'resource', 1), + ('Vitis', 'io_parallel', 'resource', 1), + ('Vivado', 'io_parallel', 'latency', 1), + ('Vitis', 'io_parallel', 'latency', 1), + ('Vivado', 'io_parallel', 'latency', 14), + ('Vitis', 'io_parallel', 'latency', 14), + ('Vivado', 'io_stream', 'latency', 1), + ('Vivado', 'io_stream', 'resource', 1), + ('Vitis', 'io_stream', 'latency', 1), + ('Vitis', 'io_stream', 'resource', 1), + ('Catapult', 'io_stream', 'latency', 1), + ('Catapult', 'io_stream', 'resource', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv_impl, rf): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -65,12 +63,11 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, conv default_precision = 'fixed<32,16>' config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy - config['LayerName']['pointwise1d']['ConvImplementation'] = conv_impl config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_{conv_impl}_rf{rf}' + / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From f5629db5c55cb8b2944d4276d8a232178527ddd1 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:19:45 -0700 Subject: [PATCH 24/41] remove pointwise conv implementation option; make it default --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 1c268ed588..92b8571d88 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -56,13 +56,8 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], //#pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - if (CONFIG_T::implementation == conv_implementation::pointwise) { - // Use pointwise unrolled implementation - CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); - } else { - // Use standard unrolled implementation - conv_1d_latency_cl(data, res, weights, biases); - } + // Use pointwise unrolled implementation + CONFIG_T::template pointwise_conv::pointwise_conv(data, res, weights, biases); } else { conv_1d_resource_cl(data, res, weights, biases); } From f4ae08f9c2cbd561596ee34888591a5696bd18d6 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:21:13 -0700 Subject: [PATCH 25/41] Restore tab --- hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h 
b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 0caa435717..dcd914dffe 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -58,7 +58,7 @@ template unsigned scale_index_K_lt_S(const template class scale_index_regular { public: static unsigned scale_index(const unsigned idx) { -#pragma HLS INLINE + #pragma HLS INLINE if (K >= S) { return scale_index_K_gte_S(idx); From ecd6b04aeee1ad341b9bfa939405571776664f6f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:27:57 -0700 Subject: [PATCH 26/41] Add back nnet_helpers.h --- hls4ml/templates/vivado/nnet_utils/nnet_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index a14517df5b..6db3f62f6e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". #define DIV_ROUNDUP(n, d) ((n + d - 1) / d) From 6f5cbd98c52e541cf2f3609736cf25c0c0f0a34c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 31 Oct 2024 21:29:56 -0700 Subject: [PATCH 27/41] format --- test/pytest/test_pointwiseconv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 2890b0ab11..1cfb43e4cd 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -27,8 +27,8 @@ ('oneAPI', 'io_stream', 'resource', 1), ('Vivado', 'io_parallel', 'resource', 1), ('Vitis', 'io_parallel', 'resource', 1), - ('Vivado', 'io_parallel', 'latency', 1), - ('Vitis', 'io_parallel', 'latency', 1), + ('Vivado', 'io_parallel', 'latency', 1), + ('Vitis', 'io_parallel', 'latency', 1), ('Vivado', 'io_parallel', 'latency', 14), ('Vitis', 'io_parallel', 'latency', 14), ('Vivado', 'io_stream', 'latency', 1), @@ -66,8 +66,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, rf): config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( - test_root_path - / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' + test_root_path / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend From 8ebeefec338bc9aa8380a8e74fd611731d7c7c19 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 12 Nov 2024 20:37:58 -0800 Subject: [PATCH 28/41] jovan comments --- .../vivado/passes/pointwise_codegen.py | 61 ++++++++++++++++++- hls4ml/backends/vivado/vivado_backend.py | 58 ------------------ .../vitis/nnet_utils/nnet_conv1d_latency.h | 2 +- .../vivado/nnet_utils/nnet_conv1d_latency.h | 2 +- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py index f459d59208..cb26fb6530 100644 --- a/hls4ml/backends/vivado/passes/pointwise_codegen.py +++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py @@ -3,6 +3,65 @@ from hls4ml.model.types import Source +def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). 
+ reuse_factor (int): Number of partitions to divide the input into. + + Returns: + str: Generated C++ function + """ + + generated_code = ( + "template\n" + "class pointwise_conv_{index} : public PointwiseConv1D {{\n" + " public:\n" + " static void pointwise_conv(\n" + " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" + " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" + " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" + " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" + " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" + " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 + " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" + " RFInputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" + " #pragma HLS UNROLL\n" + " InnerInputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" + " #pragma HLS UNROLL\n" + " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 + " }}\n" + " }}\n\n" + ).format(index=layer_idx) + indent = " " + for i in range(reuse_factor): + generated_code += indent + generated_code += ( + f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" + ) + + generated_code += ( + "\n" + " RFOutputLoop:\n" + " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" + " #pragma HLS UNROLL\n" + " InnerOutputLoop:\n" + " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" + " #pragma HLS UNROLL\n" + " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 + " }\n" + " }\n" + " }\n" + "};\n" + ) + + return generated_code + + class GeneratePointwiseConv1D(OptimizerPass): '''Generates code for pointwise 1D convolution''' @@ -17,7 +76,7 @@ def transform(self, model, node): raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') def _generate_pointwise_conv1d(self, node): - code_str = node.model.config.backend.generate_pointwise_conv1d_fn( + code_str = generate_pointwise_conv1d_fn( node.get_attr('index'), node.get_attr('reuse_factor'), ) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 8df4b86364..02d3ba17bb 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -649,61 +649,3 @@ def init_garnet(self, layer): @layer_optimizer(GarNetStack) def init_garnet_stack(self, layer): self.init_garnet(layer) - - def generate_pointwise_conv1d_fn(self, layer_idx, reuse_factor=1): - """Generate a C++ function for a pointwise convolution layer. - - Args: - layer_idx (int): Index of layer ('index' attribute). - reuse_factor (int): Number of partitions to divide the input into. 
- - Returns: - str: Generated C++ function - """ - - generated_code = ( - "template\n" - "class pointwise_conv_{index} : public PointwiseConv1D {{\n" - " public:\n" - " static void pointwise_conv(\n" - " data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n" - " res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n" - " typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n" - " typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n" - " data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n" - " res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n" # noqa: E501 - " #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n" - " RFInputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n" - " #pragma HLS UNROLL\n" - " InnerInputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n" - " #pragma HLS UNROLL\n" - " data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n" # noqa: E501 - " }}\n" - " }}\n\n" - ).format(index=layer_idx) - indent = " " - for i in range(reuse_factor): - generated_code += indent - generated_code += ( - f"pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n" - ) - - generated_code += ( - "\n" - " RFOutputLoop:\n" - " for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n" - " #pragma HLS UNROLL\n" - " InnerOutputLoop:\n" - " for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n" - " #pragma HLS UNROLL\n" - " res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n" # noqa: E501 - " }\n" - " }\n" - " }\n" - "};\n" - ) - - return generated_code diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 9102a038fd..f93fd0b269 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -127,7 +127,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = data[index_data] * weights[index_weight]; + mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); } } // end channel loop } // end filter loop diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 2692f2912c..5c7ab470de 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -126,7 +126,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = data[index_data] * weights[index_weight]; + mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); } } // end channel loop } // end filter loop From 4099c8dd1fe06a1cf39a909d4173af729c516d10 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 12 Nov 2024 21:09:31 -0800 Subject: [PATCH 29/41] p-clang-format --- 
.../templates/vitis/nnet_utils/nnet_conv1d_latency.h | 11 ++++++----- .../templates/vivado/nnet_utils/nnet_conv1d_latency.h | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index f93fd0b269..49e6ae3505 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -127,11 +127,12 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); + mult[index_mult] = CONFIG_T::template product::product( + data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -152,8 +153,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 5c7ab470de..cac2d29f1b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -126,11 +126,12 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { mult[index_mult] = 0; } else { - mult[index_mult] = CONFIG_T::template product::product(data[index_data], weights[index_weight]); + mult[index_mult] = CONFIG_T::template product::product( + data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -151,8 +152,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { From d999ad8553e37f93ced03bd398e80dc0e1849891 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Tue, 12 Nov 2024 21:21:46 -0800 Subject: [PATCH 30/41] p-clang-format --- hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h | 8 ++++---- hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 49e6ae3505..e32e82135f 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ 
b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -131,8 +131,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -153,8 +153,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index cac2d29f1b..51409dd102 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -130,8 +130,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c data[index_data], weights[index_weight]); } } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Initialize accumulator with input biases for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { @@ -152,8 +152,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; acc[ii][ff] += mult[index_mult]; } // end channel loop - } // end filter loop - } // end output loop + } // end filter loop + } // end output loop // Cast to "res_t" type for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { From 5e5b81fbfd23a5e93e5d72f8b3776a8305dd40ea Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 13 Nov 2024 20:59:57 +0100 Subject: [PATCH 31/41] Introduce optional description to layer attributes --- docs/attr_doc_gen.py | 99 ++++++++++++++++++++ hls4ml/backends/catapult/catapult_backend.py | 23 +++-- hls4ml/backends/fpga/fpga_backend.py | 24 +++-- hls4ml/backends/oneapi/oneapi_backend.py | 9 +- hls4ml/backends/quartus/quartus_backend.py | 7 +- hls4ml/backends/vivado/vivado_backend.py | 23 +++-- hls4ml/model/attributes.py | 26 ++--- hls4ml/model/layers.py | 7 +- hls4ml/utils/attribute_descriptions.py | 46 +++++++++ 9 files changed, 222 insertions(+), 42 deletions(-) create mode 100644 docs/attr_doc_gen.py create mode 100644 hls4ml/utils/attribute_descriptions.py diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py new file mode 100644 index 0000000000..8bade86833 --- /dev/null +++ b/docs/attr_doc_gen.py @@ -0,0 +1,99 @@ +import numbers + +import hls4ml.backends as backends +import hls4ml.model.attributes as attributes +import hls4ml.model.layers as layers + +all_backends = backends.get_available_backends() +# Removing duplicates but preserving order +all_layers = list(dict.fromkeys(layers.layer_map.values())) + + +class AttrList: + def __init__(self, cls_name, cls_attrs) -> None: + self.cls_name = cls_name + self.config_attrs = [attr for attr in cls_attrs if attr.configurable == True] + self.type_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'TypeAttribute'] + self.weight_attrs = [attr for attr in cls_attrs if 
+        self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute']
+        self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs]
+        self.backend_attrs = {}
+
+    def add_backend_attrs(self, backend_name, backend_attrs):
+        self.backend_attrs[backend_name] = backend_attrs
+
+
+attr_map = []
+
+for layer_cls in all_layers:
+    base_attrs = layer_cls.expected_attributes
+
+    attr_list = AttrList(layer_cls.__name__, base_attrs)
+
+    for backend_name in all_backends:
+        backend = backends.get_backend(backend_name)
+
+        backend_cls = backend.create_layer_class(layer_cls)
+        backend_attrs = backend_cls.expected_attributes
+
+        diff_atts = [attr for attr in backend_attrs if attr not in base_attrs]  # Sets are faster, but don't preserve order
+        if len(diff_atts) > 0:
+            attr_list.add_backend_attrs(backend.name, diff_atts)
+
+    attr_map.append(attr_list)
+
+
+def print_attrs(attrs, file):
+    for attr in attrs:
+        if attr.value_type == numbers.Integral:
+            vtype = 'int'
+        elif attr.__class__ == attributes.ChoiceAttribute:
+            choices = ','.join([str(c) for c in attr.choices])
+            vtype = f'list [{choices}]'
+        else:
+            vtype = attr.value_type.__name__ if hasattr(attr.value_type, '__name__') else str(attr.value_type)
+
+        if attr.default is None:
+            file.write('* ' + attr.name + ': ' + vtype + '\n\n')
+        else:
+            file.write('* ' + attr.name + ': ' + vtype + ' (Default: ' + str(attr.default) + ')\n\n')
+
+        if attr.description is not None:
+            file.write('  * ' + attr.description + '\n\n')
+
+
+with open('attributes.rst', mode='w') as file:
+    file.write('================\n')
+    file.write('Layer attributes\n')
+    file.write('================\n\n\n')
+
+    for attr_list in attr_map:
+        file.write(attr_list.cls_name + '\n')
+        file.write('=' * len(attr_list.cls_name) + '\n')
+
+        if len(attr_list.base_attrs) > 0:
+            file.write('Base attributes\n')
+            file.write('---------------\n')
+            print_attrs(attr_list.base_attrs, file)
+
+        if len(attr_list.type_attrs) > 0:
+            file.write('Type attributes\n')
+            file.write('---------------\n')
+            print_attrs(attr_list.type_attrs, file)
+
+        if len(attr_list.weight_attrs) > 0:
+            file.write('Weight attributes\n')
+            file.write('-----------------\n')
+            print_attrs(attr_list.weight_attrs, file)
+
+        if len(attr_list.config_attrs) > 0:
+            file.write('Configurable attributes\n')
+            file.write('-----------------------\n')
+            print_attrs(attr_list.config_attrs, file)
+
+        if len(attr_list.backend_attrs) > 0:
+            file.write('Backend attributes\n')
+            file.write('-----------------------\n')
+            for backend, backend_attrs in attr_list.backend_attrs.items():
+                file.write(backend + '\n')
+                file.write('^' * len(backend) + '\n')
+                print_attrs(backend_attrs, file)
diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py
index 5c85bf9b7e..28e13edf37 100644
--- a/hls4ml/backends/catapult/catapult_backend.py
+++ b/hls4ml/backends/catapult/catapult_backend.py
@@ -32,6 +32,7 @@
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType
 from hls4ml.report import parse_catapult_report
+from hls4ml.utils import attribute_descriptions as descriptions
 from hls4ml.utils.fixed_point_utils import ceil_log2

@@ -51,10 +52,12 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('static', value_type=bool, default=True))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(
+                ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static)
+            )
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

         # Add ParallelizationFactor to Conv1D/2D
@@ -65,7 +68,7 @@ def _register_layer_attributes(self):
         for layer in pf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('parallelization_factor', default=1))
+            attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf))
             self.attribute_map[layer] = attrs

         # Add ConvImplementation to Convolution+Pooling layers
@@ -73,8 +76,14 @@ def _register_layer_attributes(self):
         for layer in cnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer'))
-            attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer'))
+            attrs.append(
+                ChoiceAttribute(
+                    'conv_implementation',
+                    choices=['LineBuffer', 'Encoded'],
+                    default='LineBuffer',
+                    description=descriptions.conv_implementation,
+                )
+            )
             self.attribute_map[layer] = attrs

         sep_conv_layers = [SeparableConv1D, SeparableConv2D]
diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index a9fc09b7aa..fbfed71c5b 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -45,6 +45,7 @@
     UnspecifiedPrecisionType,
     XnorPrecisionType,
 )
+from hls4ml.utils import attribute_descriptions as descriptions
 from hls4ml.writer import get_writer

@@ -74,7 +75,7 @@ def __init__(self, name):
         for layer in accum_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(TypeAttribute('accum'))
+            attrs.append(TypeAttribute('accum', description=descriptions.accum_type))
             self.attribute_map[layer] = attrs

         rf_layers = accum_layers + [
@@ -90,7 +91,7 @@ def __init__(self, name):
         for layer in rf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('reuse_factor', default=1))
+            attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor))
             self.attribute_map[layer] = attrs

         # seperable is kind of special because it is effectively two layers that will be split
@@ -104,23 +105,34 @@ def __init__(self, name):
             self.attribute_map[layer] = attrs

         act_attrs = self.attribute_map.get(Activation, [])
-        act_attrs.append(ConfigurableAttribute('table_size', default=1024))
-        act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+        act_attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+        act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
         self.attribute_map[Activation] = act_attrs

         softmax_attrs = self.attribute_map.get(Softmax, [])
-        softmax_attrs.append(ChoiceAttribute('implementation', ['latency', 'stable', 'argmax', 'legacy'], default='stable'))
-        softmax_attrs.append(ConfigurableAttribute('skip', value_type=bool, default=False))
+        softmax_attrs.append(
+            ChoiceAttribute(
+                'implementation',
+                ['latency', 'stable', 'argmax', 'legacy'],
+                default='stable',
+                description=descriptions.softmax_implementation,
+            )
+        )
+        softmax_attrs.append(
+            ConfigurableAttribute('skip', value_type=bool, default=False, description=descriptions.softmax_skip)
+        )
         softmax_attrs.append(
             TypeAttribute(
                 'exp_table',
                 default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                description=descriptions.table_type,
             )
         )
         softmax_attrs.append(
             TypeAttribute(
                 'inv_table',
                 default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                description=descriptions.table_type,
             )
         )
         self.attribute_map[Softmax] = softmax_attrs
diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index c85a8c0e94..7d0f0d48e2 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -10,6 +10,7 @@
 from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
+from hls4ml.utils import attribute_descriptions as descriptions

 # from hls4ml.report import parse_oneapi_report

@@ -30,9 +31,9 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

         # Add ParallelizationFactor to Conv1D/2D
@@ -43,7 +44,7 @@ def _register_layer_attributes(self):
         for layer in pf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('parallelization_factor', default=1))
+            attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf))
             self.attribute_map[layer] = attrs

     def _register_flows(self):
diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py
index aecad642c6..e56e1e05a6 100644
--- a/hls4ml/backends/quartus/quartus_backend.py
+++ b/hls4ml/backends/quartus/quartus_backend.py
@@ -11,6 +11,7 @@
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
 from hls4ml.report import parse_quartus_report
+from hls4ml.utils import attribute_descriptions as descriptions


 @contextmanager
@@ -39,9 +40,9 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

     def _register_flows(self):
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index 9f8a5171d3..96c13f4b37 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -31,6 +31,7 @@
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType
 from hls4ml.report import parse_vivado_report
+from hls4ml.utils import attribute_descriptions as descriptions


 class VivadoBackend(FPGABackend):
@@ -49,10 +50,12 @@ def _register_layer_attributes(self):
         for layer in rnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1))
-            attrs.append(ConfigurableAttribute('static', value_type=bool, default=True))
-            attrs.append(ConfigurableAttribute('table_size', default=1024))
-            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8)))
+            attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor))
+            attrs.append(
+                ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static)
+            )
+            attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size))
+            attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type))
             self.attribute_map[layer] = attrs

         # Add ParallelizationFactor to Conv1D/2D
@@ -63,15 +66,21 @@ def _register_layer_attributes(self):
         for layer in pf_layers:
             attrs = self.attribute_map.get(layer, [])
-            attrs.append(ConfigurableAttribute('parallelization_factor', default=1))
+            attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf))
             self.attribute_map[layer] = attrs

         # Add ConvImplementation to Convolution+Pooling layers
         cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D]
         for layer in cnn_layers:
             attrs = self.attribute_map.get(layer, [])
-            # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer'))
-            attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer'))
+            attrs.append(
+                ChoiceAttribute(
+                    'conv_implementation',
+                    choices=['LineBuffer', 'Encoded'],
+                    default='LineBuffer',
+                    description=descriptions.conv_implementation,
+                )
+            )
             self.attribute_map[layer] = attrs

     def _register_flows(self):
diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py
index 0e8df6e10a..d4aef63409 100644
--- a/hls4ml/model/attributes.py
+++ b/hls4ml/model/attributes.py
@@ -36,11 +36,12 @@ class Attribute:
     """

-    def __init__(self, name, value_type=Integral, default=None, configurable=False):
+    def __init__(self, name, value_type=Integral, default=None, configurable=False, description=None):
         self.name = name
         self.value_type = value_type
         self.default = default
         self.configurable = configurable
+        self.description = description

     def validate_value(self, value):
         if self.value_type is not None:
@@ -68,8 +69,8 @@ class ConfigurableAttribute(Attribute):
     when defining the expected attributes of layer classes.
     """

-    def __init__(self, name, value_type=int, default=None):
-        super().__init__(name, value_type, default, configurable=True)
+    def __init__(self, name, value_type=int, default=None, description=None):
+        super().__init__(name, value_type, default, configurable=True, description=description)


 class TypeAttribute(Attribute):
@@ -79,10 +80,10 @@ class TypeAttribute(Attribute):
     As a convention, the name of the attribute storing a type will end in ``_t``.
     """

-    def __init__(self, name, default=None, configurable=True):
+    def __init__(self, name, default=None, configurable=True, description=None):
         if not name.endswith('_t'):
             name += '_t'
-        super().__init__(name, value_type=NamedType, default=default, configurable=configurable)
+        super().__init__(name, value_type=NamedType, default=default, configurable=configurable, description=description)


 class ChoiceAttribute(Attribute):
@@ -90,13 +91,12 @@ class ChoiceAttribute(Attribute):
     Represents an attribute whose value can be one of several predefined values.
     """

-    def __init__(self, name, choices, default=None, configurable=True):
-        super().__init__(name, value_type=list, default=default, configurable=configurable)
+    def __init__(self, name, choices, default=None, configurable=True, description=None):
+        super().__init__(name, value_type=list, default=default, configurable=configurable, description=description)
         assert len(choices) > 0
         if default is not None:
             assert default in choices
         self.choices = choices
-        self.value_type = str(self.choices)

     def validate_value(self, value):
         return value in self.choices
@@ -107,8 +107,8 @@ class WeightAttribute(Attribute):
     Represents an attribute that will store a weight variable.
     """

-    def __init__(self, name):
-        super().__init__(name, value_type=WeightVariable, default=None, configurable=False)
+    def __init__(self, name, description=None):
+        super().__init__(name, value_type=WeightVariable, default=None, configurable=False, description=description)


 class CodeAttrubute(Attribute):
@@ -116,8 +116,10 @@ class CodeAttrubute(Attribute):
     Represents an attribute that will store generated source code block.
     """

-    def __init__(self, name):
-        super(WeightAttribute, self).__init__(name, value_type=Source, default=None, configurable=False)
+    def __init__(self, name, description=None):
+        super(WeightAttribute, self).__init__(
+            name, value_type=Source, default=None, configurable=False, description=description
+        )


 # endregion
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 7dbeb4567e..45357344a6 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -26,6 +26,7 @@
     WeightVariable,
     find_minimum_width,
 )
+from hls4ml.utils import attribute_descriptions as descriptions
 from hls4ml.utils.string_utils import convert_to_snake_case

@@ -53,9 +54,9 @@ class Layer:
     """

     _expected_attributes = [
-        Attribute('index'),
-        ConfigurableAttribute('trace', default=False),
-        TypeAttribute('result'),
+        Attribute('index', description=descriptions.index),
+        ConfigurableAttribute('trace', default=False, description=descriptions.trace),
+        TypeAttribute('result', description=descriptions.result_type),
     ]

     @classproperty
diff --git a/hls4ml/utils/attribute_descriptions.py b/hls4ml/utils/attribute_descriptions.py
new file mode 100644
index 0000000000..92cee9e791
--- /dev/null
+++ b/hls4ml/utils/attribute_descriptions.py
@@ -0,0 +1,46 @@
+"""Strings holding attribute descriptions."""
+
+# Common attributes
+
+reuse_factor = (
+    'The number of times each multiplier is used by controlling the amount of pipelining/unrolling. '
+    'Lower number results in more parallelism and lower latency at the expense of the resources used. '
+)
+
+index = 'Internal node counter used for bookkeeping and variable/tensor naming.'
+trace = 'Enables saving of layer output (tracing).'
+
+result_type = 'The datatype (precision) of the output tensor.'
+accum_type = 'The datatype (precision) used to store intermediate results of the computation within the layer.'
+
+# Activation-related attributes
+
+table_size = 'The size of the lookup table used to approximate the function.'
+table_type = 'The datatype (precision) used for the values of the lookup table.'
+
+softmax_implementation = (
+    'Choice of implementation of softmax function. '
+    '"latency" provides good latency at the expense of extra resources. Performs well on a small number of classes. '
+    '"stable" may require extra clock cycles but has better accuracy. '
+    '"legacy" is the older implementation which has poor accuracy, but is fast and has low resource use. '
+    'It is superseded by the "latency" implementation for most applications. '
+    '"argmax" is a special implementation that can be used if only the output with the highest probability is important. '
+    'Using this implementation will save resources and clock cycles.'
+)
+softmax_skip = 'If enabled, skips the softmax node and returns the raw outputs.'
+
+# Convolution-related attributes
+
+conv_pf = (
+    'The number of outputs computed in parallel. Essentially the number of multiplications of the input window with the '
+    'convolution kernel occurring in parallel. '
+    'Higher number results in more parallelism (lower latency and II) at the expense of resources used. '
+)
+conv_implementation = '"LineBuffer" implementation is preferred over "Encoded" for most use cases.'
+
+# Recurrent-related attributes
+
+recurrent_static = (
+    'If set to True, will reuse the same recurrent block for computation, resulting in lower resource '
+    'usage at the expense of serialized computation and higher latency/II.'
+)
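[Illustration, not part of the patch series: a minimal sketch of how the `description` keyword introduced above is meant to be used. It assumes an hls4ml checkout with this patch applied, so the two modules are importable.]

    # Attach and read back a description on a configurable attribute.
    from hls4ml.model.attributes import ConfigurableAttribute
    from hls4ml.utils import attribute_descriptions as descriptions

    rf = ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor)
    print(rf.name, '->', rf.description)  # prints the shared reuse-factor help string

The same keyword is threaded through TypeAttribute, ChoiceAttribute and WeightAttribute, so the documentation generator above can render one help string per attribute regardless of its kind.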
From 1214b65a5e89ab6ce52429d131947014545aeeb0 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Wed, 13 Nov 2024 21:17:14 +0100
Subject: [PATCH 32/41] Pre-commit fix

---
 docs/attr_doc_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py
index 8bade86833..605c7669e4 100644
--- a/docs/attr_doc_gen.py
+++ b/docs/attr_doc_gen.py
@@ -12,7 +12,7 @@ class AttrList:
     def __init__(self, cls_name, cls_attrs) -> None:
         self.cls_name = cls_name
-        self.config_attrs = [attr for attr in cls_attrs if attr.configurable == True]
+        self.config_attrs = [attr for attr in cls_attrs if attr.configurable is True]
         self.type_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'TypeAttribute']
         self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute']
         self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs]

From daae96d4f75f3a0ac065ca8fb27021d28fa723a7 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Wed, 13 Nov 2024 21:29:42 -0800
Subject: [PATCH 33/41] fix

---
 hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h  | 2 +-
 hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index e32e82135f..b5f29fda06 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -127,7 +127,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
                 (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
                 mult[index_mult] = 0;
             } else {
-                mult[index_mult] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(
+                mult[index_mult] = CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::weight_t>::product(
                     data[index_data], weights[index_weight]);
             }
         } // end channel loop
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index 51409dd102..b04485af9a 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -126,7 +126,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
                 (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
                 mult[index_mult] = 0;
             } else {
-                mult[index_mult] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(
+                mult[index_mult] = CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::weight_t>::product(
                     data[index_data], weights[index_weight]);
             }
         } // end channel loop

From e813d41fdedfc1d704446c7d4310e12b4ca47c7f Mon Sep 17 00:00:00 2001
From: Vladimir Loncar
Date: Wed, 20 Nov 2024 22:11:41 +0100
Subject: [PATCH 34/41] Tweak writing of all attributes, allow writing only
 configurable attributes

---
 docs/attr_doc_gen.py       | 154 ++++++++++++++++++++++++-------
 hls4ml/model/attributes.py |  27 ++++++-
 hls4ml/model/types.py      |  15 +++-
 3 files changed, 137 insertions(+), 59 deletions(-)

diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py
index 605c7669e4..0ba2a5b77e 100644
--- a/docs/attr_doc_gen.py
+++ b/docs/attr_doc_gen.py
@@ -4,10 +4,6 @@
 import hls4ml.model.attributes as attributes
 import hls4ml.model.layers as layers

-all_backends = backends.get_available_backends()
-# Removing duplicates but preserving order
-all_layers = list(dict.fromkeys(layers.layer_map.values()))
-

 class AttrList:
     def __init__(self, cls_name, cls_attrs) -> None:
@@ -17,29 +13,61 @@ def __init__(self, cls_name, cls_attrs) -> None:
         self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute']
         self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs]
         self.backend_attrs = {}
+        self.reverse_backend_attrs = []  # Will hold (attr, backend_name) pairs, used temporarily
+        self.unique_backend_attrs = []

     def add_backend_attrs(self, backend_name, backend_attrs):
         self.backend_attrs[backend_name] = backend_attrs
+        for attr in backend_attrs:
+            self.reverse_backend_attrs.append((attr, backend_name))
+
+    def sift_backend_attrs(self):
+        grouped_dict = {}
+        for attr, backend_name in self.reverse_backend_attrs:
+            if attr not in grouped_dict:
+                grouped_dict[attr] = []
+            grouped_dict[attr].append(backend_name)
+
+        for attr, backend_names in grouped_dict.items():
+            attr.available_in = backend_names
+            self.unique_backend_attrs.append(attr)
+
+    @property
+    def only_configurable(self):
+        all_attrs = self.config_attrs + self.type_attrs + self.unique_backend_attrs
+        return [attr for attr in all_attrs if attr.configurable is True]
+

+def convert_to_attr_list():
+    all_backends = backends.get_available_backends()
+    # Removing duplicates but preserving order
+    all_layers = list(dict.fromkeys(layers.layer_map.values()))
+    all_layers_attrs = []

-attr_map = []
+    for layer_cls in all_layers:
+        base_attrs = layer_cls.expected_attributes

-for layer_cls in all_layers:
-    base_attrs = layer_cls.expected_attributes
+        attr_list = AttrList(layer_cls.__name__, base_attrs)

-    attr_list = AttrList(layer_cls.__name__, base_attrs)
+        for backend_name in all_backends:
+            backend = backends.get_backend(backend_name)

-    for backend_name in all_backends:
-        backend = backends.get_backend(backend_name)
+            backend_cls = backend.create_layer_class(layer_cls)
+            backend_attrs = backend_cls.expected_attributes

-        backend_cls = backend.create_layer_class(layer_cls)
-        backend_attrs = backend_cls.expected_attributes
+            diff_atts = [
+                attr for attr in backend_attrs if attr not in base_attrs
+            ]  # Sets are faster, but don't preserve order
+            if len(diff_atts) > 0:
+                attr_list.add_backend_attrs(backend.name, diff_atts)

-        diff_atts = [attr for attr in backend_attrs if attr not in base_attrs]  # Sets are faster, but don't preserve order
-        if len(diff_atts) > 0:
-            attr_list.add_backend_attrs(backend.name, diff_atts)
+        all_layers_attrs.append(attr_list)

-    attr_map.append(attr_list)
+    for attr_list in all_layers_attrs:
+        attr_list.sift_backend_attrs()
+
+    return all_layers_attrs


 def print_attrs(attrs, file):
@@ -60,40 +88,62 @@ def print_attrs(attrs, file):
         if attr.description is not None:
             file.write('  * ' + attr.description + '\n\n')

+        if hasattr(attr, 'available_in'):
+            file.write('  * Available in: ' + ', '.join(attr.available_in) + '\n\n')
+
+
+def write_all_attributes(all_layers_attrs):
+    with open('attributes.rst', mode='w') as file:
+        file.write('================\n')
+        file.write('Layer attributes\n')
+        file.write('================\n\n\n')
+
+        for attr_list in all_layers_attrs:
+            file.write(attr_list.cls_name + '\n')
+            file.write('=' * len(attr_list.cls_name) + '\n')
+
+            if len(attr_list.base_attrs) > 0:
+                file.write('Base attributes\n')
+                file.write('---------------\n')
+                print_attrs(attr_list.base_attrs, file)
+
+            if len(attr_list.type_attrs) > 0:
+                file.write('Type attributes\n')
+                file.write('---------------\n')
+                print_attrs(attr_list.type_attrs, file)
+
+            if len(attr_list.weight_attrs) > 0:
+                file.write('Weight attributes\n')
+                file.write('-----------------\n')
+                print_attrs(attr_list.weight_attrs, file)
+
+            if len(attr_list.config_attrs) > 0:
+                file.write('Configurable attributes\n')
+                file.write('-----------------------\n')
+                print_attrs(attr_list.config_attrs, file)
+
+            if len(attr_list.backend_attrs) > 0:
+                file.write('Backend-specific attributes\n')
+                file.write('---------------------------\n')
+                print_attrs(attr_list.unique_backend_attrs, file)
+
+
+def write_only_configurable(all_layers_attrs):
+    with open('attributes.rst', mode='w') as file:
+        file.write('================\n')
+        file.write('Layer attributes\n')
+        file.write('================\n\n\n')
+
+        for attr_list in all_layers_attrs:
+            file.write(attr_list.cls_name + '\n')
+            file.write('=' * len(attr_list.cls_name) + '\n')
+
+            config_attrs = attr_list.only_configurable
+            if len(config_attrs) > 0:
+                print_attrs(config_attrs, file)
+

-with open('attributes.rst', mode='w') as file:
-    file.write('================\n')
-    file.write('Layer attributes\n')
-    file.write('================\n\n\n')
-
-    for attr_list in attr_map:
-        file.write(attr_list.cls_name + '\n')
-        file.write('=' * len(attr_list.cls_name) + '\n')
-
-        if len(attr_list.base_attrs) > 0:
-            file.write('Base attributes\n')
-            file.write('---------------\n')
-            print_attrs(attr_list.base_attrs, file)
-
-        if len(attr_list.type_attrs) > 0:
-            file.write('Type attributes\n')
-            file.write('---------------\n')
-            print_attrs(attr_list.type_attrs, file)
-
-        if len(attr_list.weight_attrs) > 0:
-            file.write('Weight attributes\n')
-            file.write('-----------------\n')
-            print_attrs(attr_list.weight_attrs, file)
-
-        if len(attr_list.config_attrs) > 0:
-            file.write('Configurable attributes\n')
-            file.write('-----------------------\n')
-            print_attrs(attr_list.config_attrs, file)
-
-        if len(attr_list.backend_attrs) > 0:
-            file.write('Backend attributes\n')
-            file.write('-----------------------\n')
-            for backend, backend_attrs in attr_list.backend_attrs.items():
-                file.write(backend + '\n')
-                file.write('^' * len(backend) + '\n')
-                print_attrs(backend_attrs, file)
+if __name__ == '__main__':
+    all_layers_attrs = convert_to_attr_list()
+    write_all_attributes(all_layers_attrs)
+    # write_only_configurable(all_layers_attrs)
diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py
index d4aef63409..d03d2bd108 100644
--- a/hls4ml/model/attributes.py
+++ b/hls4ml/model/attributes.py
@@ -60,6 +60,20 @@ def config_name(self):
         """
         return convert_to_pascal_case(self.name)

+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Attribute):
+            return NotImplemented
+        return (
+            self.name == other.name
+            and self.value_type == other.value_type
+            and self.default == other.default
+            and self.configurable == other.configurable
+            and self.description == other.description
+        )
+
+    def __hash__(self) -> int:
+        return hash((self.name, self.value_type, self.default, self.configurable, self.description))
+

 class ConfigurableAttribute(Attribute):
     """
@@ -83,7 +97,7 @@ class ConfigurableAttribute(Attribute):
     when defining the expected attributes of layer classes.
     """

-    def __init__(self, name, value_type=int, default=None, description=None):
+    def __init__(self, name, value_type=Integral, default=None, description=None):
         super().__init__(name, value_type, default, configurable=True, description=description)

@@ -101,6 +115,13 @@ def __init__(self, name, choices, default=None, configurable=True, description=N
     def validate_value(self, value):
         return value in self.choices

+    def __eq__(self, other: object) -> bool:
+        base_eq = super().__eq__(other)
+        return base_eq and hasattr(other, 'choices') and set(self.choices) == set(other.choices)
+
+    def __hash__(self) -> int:
+        return super().__hash__() ^ hash(tuple(sorted(self.choices)))
+

 class WeightAttribute(Attribute):
     """
@@ -117,9 +138,7 @@ class CodeAttrubute(Attribute):
     """

     def __init__(self, name, description=None):
-        super(WeightAttribute, self).__init__(
-            name, value_type=Source, default=None, configurable=False, description=description
-        )
+        super().__init__(name, value_type=Source, default=None, configurable=False, description=description)


 # endregion
diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py
index 9fb257a1ef..9d0a97440f 100644
--- a/hls4ml/model/types.py
+++ b/hls4ml/model/types.py
@@ -64,12 +64,15 @@ def __init__(self, width, signed):
         self.width = width
         self.signed = signed

-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         eq = self.width == other.width
         eq = eq and self.signed == other.signed

         return eq

+    def __hash__(self) -> int:
+        return hash((self.width, self.signed))
+

 class IntegerPrecisionType(PrecisionType):
     """Arbitrary precision integer data type.
@@ -89,12 +92,15 @@ def __str__(self):
         return typestring

     # Does this need to make sure other is also an IntegerPrecisionType? I could see a match between Fixed and Integer
-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         if isinstance(other, IntegerPrecisionType):
             return super().__eq__(other)

         return False

+    def __hash__(self) -> int:
+        return super().__hash__()
+
     @property
     def integer(self):
         return self.width
@@ -186,7 +192,7 @@ def __str__(self):
         typestring = '{signed}fixed<{args}>'.format(signed='u' if not self.signed else '', args=args)
         return typestring

-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         if isinstance(other, FixedPrecisionType):
             eq = super().__eq__(other)
             eq = eq and self.integer == other.integer
@@ -197,6 +203,9 @@ def __eq__(self, other):

         return False

+    def __hash__(self) -> int:
+        return super().__hash__() ^ hash((self.integer, self.rounding_mode, self.saturation_mode, self.saturation_bits))
+

 class XnorPrecisionType(PrecisionType):
     """
From d56dc7349e94ad8b6aaefe5b0f22385d3d9a9e03 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Thu, 21 Nov 2024 21:28:41 -0800
Subject: [PATCH 35/41] vladimir comments

---
 .../vivado/passes/convolution_templates.py    | 20 +++--
 hls4ml/backends/vivado/passes/pointwise.py    | 36 +--------
 .../vivado/passes/pointwise_codegen.py        | 78 +++++++++----------
 .../templates/vitis/nnet_utils/nnet_conv1d.h  | 32 +++++---
 .../vitis/nnet_utils/nnet_conv1d_latency.h    |  8 +-
 .../templates/vivado/nnet_utils/nnet_conv1d.h | 32 +++++---
 .../vivado/nnet_utils/nnet_conv1d_latency.h   |  8 +-
 .../vivado/nnet_utils/nnet_function_stubs.h   | 10 +++
 8 files changed, 114 insertions(+), 110 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 02f13ef6f0..551e4f4167 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -60,6 +60,8 @@ typedef {config_t} mult_config;
     template<unsigned K, unsigned S, unsigned W>
     using scale_index = nnet::{scale_index_type}<K, S, W>;
+    template<class data_T, class res_T, class CONFIG_T>
+    using conv_kernel = nnet::{conv_fn}<data_T, res_T, CONFIG_T>;
 }};
 const ap_uint<config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""

@@ -93,16 +95,24 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv1DBuffer'

-        if node.get_attr('filt_width') == 1 and node.model.config.get_config_value('IOType') == 'io_parallel':
-            params['pointwise_fn'] = f'pointwise_conv_{node.index}'
+        is_pointwise_parallel_latency = node.get_attr('filt_width') == 1 and node.get_attr('strategy').lower() == 'latency' and node.model.config.get_config_value('IOType') == 'io_parallel'
+        if is_pointwise_parallel_latency:
+            params['conv_fn'] = f'pointwise_conv_{node.index}'
         else:
-            params['pointwise_fn'] = 'PointwiseConv1D'
+            if node.get_attr('strategy').lower() == 'latency':
+                params['conv_fn'] = 'Conv1DLatency'
+            elif node.get_attr('strategy').lower() == 'resource':
+                params['conv_fn'] = 'Conv1DResource'

         conv_config = self.template.format(**params)

         mult_params = self._default_config_params(node)
-        mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
-        mult_params['n_out'] = node.get_attr('n_filt')
+        if is_pointwise_parallel_latency:
+            mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+            mult_params['n_out'] = node.get_attr('n_filt') / mult_params['reuse']
+        else:
+            mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
+            mult_params['n_out'] = node.get_attr('n_filt')
         mult_params['nzeros'] = node.get_weights('weight').nzeros
         mult_params['product_type'] = get_backend('vivado').product_type(
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index 79a72c1e6a..34568b09f7 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -4,45 +4,13 @@
     Conv1DFunctionTemplate,
     Conv2DConfigTemplate,
     Conv2DFunctionTemplate,
+    conv1d_config_template,
     conv2d_config_template,
     conv_mult_config_template,
 )
 from hls4ml.model.layers import register_layer
 from hls4ml.model.optimizer import OptimizerPass

-pointwise_conv1d_config_template = """struct config{index} : nnet::conv1d_config {{
-    static const unsigned pad_left = {pad_left};
-    static const unsigned pad_right = {pad_right};
-    static const unsigned in_width = {in_width};
-    static const unsigned n_chan = {n_chan};
-    static const unsigned filt_width = {filt_width};
-    static const unsigned kernel_size = filt_width;
-    static const unsigned n_filt = {n_filt};
-    static const unsigned stride_width = {stride_width};
-    static const unsigned dilation = {dilation};
-    static const unsigned out_width = {out_width};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned n_zeros = {nzeros};
-    static const bool store_weights_in_bram = false;
-    static const unsigned strategy = nnet::{strategy};
-    static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
-    static const unsigned min_width = {min_width};
-    static const ap_uint<filt_width> pixels[min_width];
-    static const unsigned n_partitions = {n_partitions};
-    static const unsigned n_pixels = out_width / n_partitions;
-    template<class data_T, class CONFIG_T>
-    using fill_buffer = nnet::{fill_fn}<data_T, CONFIG_T>;
-    typedef {accum_t.name} accum_t;
-    typedef {bias_t.name} bias_t;
-    typedef {weight_t.name} weight_t;
-    typedef {config_t} mult_config;
-    template<unsigned K, unsigned S, unsigned W>
-    using scale_index = nnet::{scale_index_type}<K, S, W>;
-    template<class data_T, class res_T, class CONFIG_T>
-    using pointwise_conv = nnet::{pointwise_fn}<data_T, res_T, CONFIG_T>;
-}};
-const ap_uint<config{index}::filt_width> config{index}::pixels[] = {{{instructions}}};\n"""
-
 pointwise_conv1d_function_template = (
     'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 )
@@ -57,7 +25,7 @@ class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate):
     def __init__(self):
         super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D)
-        self.template = pointwise_conv1d_config_template
+        self.template = conv1d_config_template
         self.mult_template = conv_mult_config_template

diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
index cb26fb6530..763b3e510c 100644
--- a/hls4ml/backends/vivado/passes/pointwise_codegen.py
+++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py
@@ -15,48 +15,48 @@ def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1):
     """

     generated_code = (
-        "template<class data_T, class res_T, typename CONFIG_T>\n"
-        "class pointwise_conv_{index} : public PointwiseConv1D<data_T, res_T, CONFIG_T> {{\n"
-        "  public:\n"
-        "    static void pointwise_conv(\n"
-        "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
-        "        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n"
-        "        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n"
-        "        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n"
-        "        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n"  # noqa: E501
-        "        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n"
-        "        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n"  # noqa: E501
-        "        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n"
-        "    RFInputLoop:\n"
-        "        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n"
-        "            #pragma HLS UNROLL\n"
-        "        InnerInputLoop:\n"
-        "            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n"
-        "                #pragma HLS UNROLL\n"
-        "                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n"  # noqa: E501
-        "            }}\n"
-        "        }}\n\n"
+        'template<class data_T, class res_T, typename CONFIG_T>\n'
+        'class pointwise_conv_{index} : public Conv1DKernel<data_T, res_T, CONFIG_T> {{\n'
+        '  public:\n'
+        '    static void conv(\n'
+        '        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n'
+        '        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n'
+        '        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n'
+        '        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n'
+        '        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n'  # noqa: E501
+        '        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n'
+        '        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n'  # noqa: E501
+        '        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n'
+        '    RFInputLoop:\n'
+        '        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n'
+        '            #pragma HLS UNROLL\n'
+        '        InnerInputLoop:\n'
+        '            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n'
+        '                #pragma HLS UNROLL\n'
+        '                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n'  # noqa: E501
+        '            }}\n'
+        '        }}\n\n'
     ).format(index=layer_idx)

-    indent = "    "
+    indent = '    '
     for i in range(reuse_factor):
         generated_code += indent
         generated_code += (
-            f"pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n"
+            f'pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n'
         )

     generated_code += (
-        "\n"
-        "    RFOutputLoop:\n"
-        "    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n"
-        "        #pragma HLS UNROLL\n"
-        "    InnerOutputLoop:\n"
-        "        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n"
-        "            #pragma HLS UNROLL\n"
-        "            res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n"  # noqa: E501
-        "        }\n"
-        "    }\n"
-        "    }\n"
-        "};\n"
+        '\n'
+        '    RFOutputLoop:\n'
+        '    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n'
+        '        #pragma HLS UNROLL\n'
+        '    InnerOutputLoop:\n'
+        '        for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n'
+        '            #pragma HLS UNROLL\n'
+        '            res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n'  # noqa: E501
+        '        }\n'
+        '    }\n'
+        '    }\n'
+        '};\n'
     )

     return generated_code
@@ -66,14 +66,10 @@ class GeneratePointwiseConv1D(OptimizerPass):
     '''Generates code for pointwise 1D convolution'''

     def match(self, node):
-        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel'
+        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' and node.get_attr('filt_width') == 1

     def transform(self, model, node):
-        node_class = node.__class__.__name__
-        if '1D' in node_class:
-            self._generate_pointwise_conv1d(node)
-        else:
-            raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})')
+        self._generate_pointwise_conv1d(node)

     def _generate_pointwise_conv1d(self, node):
         code_str = generate_pointwise_conv1d_fn(
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index 92b8571d88..46beeacb03 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -4,6 +4,7 @@
 #include "nnet_common.h"
 #include "nnet_conv1d_latency.h"
 #include "nnet_conv1d_resource.h"
+#include "nnet_function_stubs.h"
 #include <cstdlib>

 namespace nnet {
@@ -38,11 +39,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
     //#pragma HLS INLINE recursive

-    if (CONFIG_T::strategy == nnet::latency) {
-        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else {
-        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    }
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }

 template <class data_T, class res_T, typename CONFIG_T>
@@ -55,13 +52,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
     //#pragma HLS INLINE recursive

-    if (CONFIG_T::strategy == nnet::latency) {
-        // Use pointwise unrolled implementation
-        CONFIG_T::template pointwise_conv<data_T, res_T, CONFIG_T>::pointwise_conv(data, res, weights, biases);
-    } else {
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DLatency : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        //#pragma HLS INLINE region
+        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DResource : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        //#pragma HLS INLINE region
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
-}
+};

 } // namespace nnet
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index b5f29fda06..6006711d8f 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -107,9 +107,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     #pragma HLS ARRAY_PARTITION variable=biases complete dim=0

     // Limit multipliers to control parallelization
-    constexpr unsigned multiplier_limit = DIV_ROUNDUP(
-        CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor);
-#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
@@ -159,8 +157,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-            #pragma HLS UNROLL
-            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        #pragma HLS UNROLL
+            res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
 }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index f0f1c133b9..72bce78067 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -4,6 +4,7 @@
 #include "nnet_common.h"
 #include "nnet_conv1d_latency.h"
 #include "nnet_conv1d_resource.h"
+#include "nnet_function_stubs.h"
 #include <cstdlib>

 namespace nnet {
@@ -37,11 +38,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
                 typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     #pragma HLS INLINE region

-    if (CONFIG_T::strategy == nnet::latency) {
-        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else {
-        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    }
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }

 template <class data_T, class res_T, typename CONFIG_T>
@@ -53,13 +50,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],

     #pragma HLS INLINE region

-    if (CONFIG_T::strategy == nnet::latency) {
-        // Use pointwise unrolled implementation
-        CONFIG_T::template pointwise_conv<data_T, res_T, CONFIG_T>::pointwise_conv(data, res, weights, biases);
-    } else {
+    CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DLatency : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        #pragma HLS INLINE region
+        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class Conv1DResource : public Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        #pragma HLS INLINE region
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
-}
+};

 } // namespace nnet
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index b04485af9a..004109d954 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -106,9 +106,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     #pragma HLS ARRAY_PARTITION variable=biases complete dim=0

     // Limit multipliers to control parallelization
-    constexpr unsigned multiplier_limit = DIV_ROUNDUP(
-        CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor, CONFIG_T::reuse_factor);
-#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
@@ -158,8 +156,8 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-            #pragma HLS UNROLL
-            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        #pragma HLS UNROLL
+            res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
 }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
index 1316bbe776..8ce2381e06 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
@@ -37,6 +37,16 @@ template <class data_T, class res_T, typename CONFIG_T> class DenseKernel {
     }
 };

+template <class data_T, class res_T, typename CONFIG_T> class Conv1DKernel {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                     res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+        // To be implemented in subclasses
+    }
+};
+
 } // namespace nnet

 #endif

From dd021ecd672ca9a08674536f166ad3f149b1f305 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Fri, 22 Nov 2024 02:02:19 -0800
Subject: [PATCH 36/41] fix n_in/n_out

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 551e4f4167..af04df0ec0 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -108,8 +108,8 @@ def format(self, node):

         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency:
-            mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
-            mult_params['n_out'] = node.get_attr('n_filt') / mult_params['reuse']
+            mult_params['n_in'] = int(node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse'])
+            mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')

From 93acaa6f7849f1a6e33eb3aa90cdc17ba0f1d2c4 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Fri, 22 Nov 2024 08:20:52 -0800
Subject: [PATCH 37/41] pre-commit

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 10 ++++++++--
 hls4ml/backends/vivado/passes/pointwise_codegen.py     |  6 +++++-
 .../templates/vitis/nnet_utils/nnet_conv1d_latency.h   |  2 +-
 .../templates/vivado/nnet_utils/nnet_conv1d_latency.h  |  2 +-
 .../templates/vivado/nnet_utils/nnet_function_stubs.h  |  3 +--
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index af04df0ec0..bd243da290 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -95,7 +95,11 @@ def format(self, node):
         else:
             params['fill_fn'] = 'FillConv1DBuffer'

-        is_pointwise_parallel_latency = node.get_attr('filt_width') == 1 and node.get_attr('strategy').lower() == 'latency' and node.model.config.get_config_value('IOType') == 'io_parallel'
+        is_pointwise_parallel_latency = (
+            node.get_attr('filt_width') == 1
+            and node.get_attr('strategy').lower() == 'latency'
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+        )
         if is_pointwise_parallel_latency:
             params['conv_fn'] = f'pointwise_conv_{node.index}'
         else:
@@ -108,8 +112,8 @@ def format(self, node):

         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency:
-            mult_params['n_in'] = int(node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse'])
+            mult_params['n_in'] = int(
+                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+            )
             mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
index 763b3e510c..d41d51f82f 100644
--- a/hls4ml/backends/vivado/passes/pointwise_codegen.py
+++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py
@@ -66,7 +66,11 @@ class GeneratePointwiseConv1D(OptimizerPass):
     '''Generates code for pointwise 1D convolution'''

     def match(self, node):
-        return isinstance(node, Conv1D) and node.model.config.get_config_value('IOType') == 'io_parallel' and node.get_attr('filt_width') == 1
+        return (
+            isinstance(node, Conv1D)
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+            and node.get_attr('filt_width') == 1
+        )

     def transform(self, model, node):
         self._generate_pointwise_conv1d(node)
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index 6006711d8f..e166cdd470 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -157,7 +157,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-        #pragma HLS UNROLL
+            #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index 004109d954..ef2f94dcaf 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -156,7 +156,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     // Cast to "res_t" type
     for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
-        #pragma HLS UNROLL
+            #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
         }
     }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
index 8ce2381e06..97774bc95b 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h
@@ -39,8 +39,7 @@ template <class data_T, class res_T, typename CONFIG_T> class DenseKernel {

 template <class data_T, class res_T, typename CONFIG_T> class Conv1DKernel {
   public:
-    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                     res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
         // To be implemented in subclasses

From 1867dfca0fcf7c58547423437cdf10c36e05068e Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Mon, 25 Nov 2024 08:09:42 -0800
Subject: [PATCH 38/41] fix resource strategy

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index bd243da290..e098107eae 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -105,7 +105,7 @@ def format(self, node):
         else:
             if node.get_attr('strategy').lower() == 'latency':
                 params['conv_fn'] = 'Conv1DLatency'
-            elif node.get_attr('strategy').lower() == 'resource':
+            else:
                 params['conv_fn'] = 'Conv1DResource'

         conv_config = self.template.format(**params)
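[Illustration, not part of the patch series: taken together, patches 35-38 leave the io_parallel Conv1D kernel selection as restated below. This is a hypothetical helper written only for readability; the real logic lives inline in convolution_templates.py.]

    def select_conv_fn(node):
        # Pointwise latency convs get a generated, reuse-factor-unrolled kernel;
        # everything else falls back to the Conv1DLatency/Conv1DResource wrappers.
        strategy = node.get_attr('strategy').lower()
        if (
            node.get_attr('filt_width') == 1
            and strategy == 'latency'
            and node.model.config.get_config_value('IOType') == 'io_parallel'
        ):
            return f'pointwise_conv_{node.index}'
        return 'Conv1DLatency' if strategy == 'latency' else 'Conv1DResource'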
@@ -85,6 +87,9 @@ def transform(self, model, node):
             can_propagate = False

         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down MatMul node; model probably not supported.', stacklevel=1
+            )
             return False

         model.remove_node(apply_alpha)
@@ -124,6 +129,9 @@ def transform(self, model, node):
             try:
                 bias = bias0 + bias1
             except ValueError:
+                warnings.warn(
+                    'Failed to propagate quantization scales down Add node; model probably not supported.', stacklevel=1
+                )
                 return False

             model.remove_node(in0)
@@ -169,6 +177,7 @@ def transform(self, model, node):
             model.insert_node(new_node)
             return True
         else:
+            warnings.warn('Failed to propagate quantization bias down Add node; model probably not supported.', stacklevel=1)
             return False

@@ -243,6 +252,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -287,6 +299,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -308,6 +323,9 @@ def transform(self, model, node):
                 can_propagate = False

         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -367,6 +385,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -388,6 +409,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -412,6 +436,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again
@@ -445,6 +472,9 @@ def transform(self, model, node):
             except ValueError:
                 can_propagate = False
         if not can_propagate:
+            warnings.warn(
+                'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1
+            )
             return False

         # to remove warning, since these get set again

From 915d2e134337451cf473147328796fb15854e67a Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Wed, 4 Dec 2024 12:57:28 -0600
Subject: [PATCH 40/41] better handle cases when there is no previous node

---
 hls4ml/model/optimizer/passes/batchnorm_opt.py | 4 ++--
 hls4ml/model/optimizer/passes/bn_fuse.py       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py
index e18d79ff4a..26b7b18e38 100644
--- a/hls4ml/model/optimizer/passes/batchnorm_opt.py
+++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py
@@ -166,7 +166,7 @@ class FuseConsecutiveBatchNormalization(OptimizerPass):
     """

     def match(self, node):
-        prev_node = node.get_input_node(node.inputs[0])
+        prev_node = node.get_input_node()
         basic_match = (
             isinstance(node, BatchNormalization)
             and isinstance(prev_node, BatchNormalization)
@@ -194,7 +194,7 @@ def match(self, node):
         return False

     def transform(self, model, node):
-        prev_node = node.get_input_node(node.inputs[0])
+        prev_node = node.get_input_node()
         prev_map = prev_node.get_output_use_map()

         if len(prev_map[prev_node.outputs[0]]) > 1:
diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py
index 000d8380ce..be81d5fb3d 100644
--- a/hls4ml/model/optimizer/passes/bn_fuse.py
+++ b/hls4ml/model/optimizer/passes/bn_fuse.py
@@ -18,7 +18,7 @@ class FuseBatchNormalization(OptimizerPass):
     """

     def match(self, node):
-        prev_node = node.get_input_node(node.inputs[0])
+        prev_node = node.get_input_node()
         basic_match = (
             isinstance(node, BatchNormalization)
             and isinstance(prev_node, (Dense, Conv1D, Conv2D))

From 88c1fe76418e79cf400acd6ef211a20ed2717f42 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <59868635+bo3z@users.noreply.github.com>
Date: Wed, 4 Dec 2024 22:52:02 +0100
Subject: [PATCH 41/41] Minor doc improvements to attributes (#57)

* Minor doc improvements to attributes

* Minor fixes
---
 hls4ml/utils/attribute_descriptions.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hls4ml/utils/attribute_descriptions.py b/hls4ml/utils/attribute_descriptions.py
index 92cee9e791..756f276fa1 100644
--- a/hls4ml/utils/attribute_descriptions.py
+++ b/hls4ml/utils/attribute_descriptions.py
@@ -5,10 +5,11 @@
 reuse_factor = (
     'The number of times each multiplier is used by controlling the amount of pipelining/unrolling. '
     'Lower number results in more parallelism and lower latency at the expense of the resources used.'
+    ' Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence the lowest possible latency.'
 )

 index = 'Internal node counter used for bookkeeping and variable/tensor naming.'
-trace = 'Enables saving of layer output (tracing).'
+trace = 'Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...).'

 result_type = 'The datatype (precision) of the output tensor.'
 accum_type = 'The datatype (precision) used to store intermediate results of the computation within the layer.'
@@ -35,8 +36,12 @@
     'The number of outputs computed in parallel. Essentially the number of multiplications of input window with the '
     'convolution kernel occuring in parallel. '
     'Higher number results in more parallelism (lower latency and II) at the expense of resources used.'
+    ' Currently only supported in io_parallel.'
 )

-conv_implementation = '"LineBuffer" implementation is preferred over "Encoded" for most use cases.'
+conv_implementation = (
+    '"LineBuffer" implementation is preferred over "Encoded" for most use cases. '
+    'This attribute only applies to io_stream.'
+)

 # Recurrent-related attributes
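
Editor's note: the move_scales change in PATCH 39 applies one repeated pattern: emit a warning, then abort the optimizer pass by returning False instead of failing silently. A distilled standalone sketch of that pattern follows; the function `try_propagate` and its arguments are hypothetical illustrations, not hls4ml API.

```python
import warnings


def try_propagate(can_propagate: bool, node_kind: str = 'Conv') -> bool:
    """Sketch of the warn-and-bail pattern added to the scale-moving passes."""
    if not can_propagate:
        # stacklevel=1 attributes the warning to this warnings.warn() call,
        # the same style used throughout move_scales.py.
        warnings.warn(
            f'Failed to propagate quantization scales down {node_kind} node; model probably not supported.',
            stacklevel=1,
        )
        return False  # the pass reports that no transformation was applied
    return True
```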
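
Editor's note: the attribute strings touched in PATCH 41 document user-facing configuration knobs, and PATCH 37 gates the pointwise-latency path on filt_width == 1, latency strategy, and io_parallel. Below is a rough sketch of where these knobs surface on the Python side; it is not part of the patch set, the toy Keras model and layer name are hypothetical, and the config keys ('Strategy', 'ReuseFactor') are assumed to follow hls4ml's usual naming.

```python
import hls4ml
from tensorflow.keras.layers import Conv1D, Input
from tensorflow.keras.models import Model

# Toy model: a single 1x1 ("pointwise") Conv1D layer, so filt_width == 1.
inp = Input(shape=(32, 4))
out = Conv1D(8, kernel_size=1, name='pointwise_conv')(inp)
model = Model(inp, out)

config = hls4ml.utils.config_from_keras_model(model, granularity='name')

# Latency strategy + io_parallel + filt_width == 1 matches the
# is_pointwise_parallel_latency condition in convolution_templates.py.
config['Model']['Strategy'] = 'Latency'

# ReuseFactor = 1: all multiplications execute in parallel (lowest latency).
config['LayerName']['pointwise_conv']['ReuseFactor'] = 1

hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, io_type='io_parallel', backend='Vivado'
)
```

ConvImplementation ('LineBuffer' vs 'Encoded') is deliberately not set in this sketch: per the updated description above, that attribute only applies to io_stream.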