hls4ml Optimization API [Part 2] #809

Merged

Changes from all commits (21 commits)
4e92b7b
Introduce unrolled implementation of Dense Resource
bo3z May 26, 2023
4fa21cb
Code generation for unrolled Dense
bo3z May 26, 2023
22e815b
Fix incorrect BRAM reporting (#798)
bo3z May 29, 2023
9cab74a
Add post-synthesis design optimisation to remove unused BRAM
bo3z Jun 11, 2023
d79f868
Tests for unrolled Dense
bo3z Jun 11, 2023
ff86c26
pre-commit on hls4ml Optimization pt.2
bo3z Jun 16, 2023
0f0adc4
Fix failing PyTests
bo3z Jun 16, 2023
2a7cde5
Merge branch 'master' into hls4ml-optimization-api-part-2
bo3z Dec 4, 2023
d509976
Merge branch 'main' into hls4ml-optimization-api-part-2
jmitrevs May 3, 2024
f1a238d
Merge remote-tracking branch 'upstream/main' into hw_opt_p2
vloncar Jun 4, 2024
0ea246c
Refactor matrix-multiplication kernel as a function pointer
vloncar Jul 15, 2024
2ed0865
Reorganize codegen of unrolled implementation
vloncar Aug 22, 2024
10f648c
Merge remote-tracking branch 'upstream/main' into hw_opt_p2
vloncar Aug 22, 2024
fbc4107
Remove mentions of dense_resource_implementation
vloncar Aug 25, 2024
ecda5c9
Default to 'auto' for pipeline style and move check to an optimizer
vloncar Aug 25, 2024
ce8431d
Pimp the docs a bit
vloncar Aug 25, 2024
ac8d9fd
Merge branch 'main' into hw_opt_p2
vloncar Oct 1, 2024
eff80aa
Merge remote-tracking branch 'upstream/main' into hw_opt_p2
vloncar Oct 1, 2024
c4af46a
Rename "unrolled" -> "resource_unrolled"
vloncar Oct 7, 2024
97c5347
Move optimization API to "dsp_aware_pruning" module (new optimization…
vloncar Oct 7, 2024
cfbad0b
Merge branch 'main' into hls4ml-optimization-api-part-2
JanFSchulte Oct 22, 2024
18 changes: 9 additions & 9 deletions docs/advanced/model_optimization.rst
@@ -13,11 +13,11 @@ The code block below showcases three use cases of the hls4ml Optimization API -
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.losses import CategoricalCrossentropy
-from hls4ml.optimization.keras import optimize_model
-from hls4ml.optimization.keras.utils import get_model_sparsity
-from hls4ml.optimization.attributes import get_attributes_from_keras_model
-from hls4ml.optimization.objectives import ParameterEstimator
-from hls4ml.optimization.scheduler import PolynomialScheduler
+from hls4ml.optimization.dsp_aware_pruning.keras import optimize_model
+from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_model_sparsity
+from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model
+from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator
+from hls4ml.optimization.dsp_aware_pruning.scheduler import PolynomialScheduler
# Define baseline model and load data
# X_train, y_train = ...
# X_val, y_val = ...
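
# --- Illustrative continuation (editor's sketch, not part of this diff) ---
# The hunk is truncated here. A minimal sketch of how the example can proceed,
# assuming the optimize_model signature documented in Part 1 of the API;
# all hyperparameter values below are placeholders.
scheduler = PolynomialScheduler(5, final_sparsity=0.10)  # scheduler arguments assumed
model_attributes = get_attributes_from_keras_model(baseline_model)

optimized_model = optimize_model(
    baseline_model, model_attributes, ParameterEstimator, scheduler,
    X_train, y_train, X_val, y_val,
    batch_size=64, epochs=10,
    optimizer=Adam(), loss_fn=CategoricalCrossentropy(from_logits=True),
    validation_metric=CategoricalAccuracy(), increasing=True, rtol=0.975,
)

print(get_model_sparsity(optimized_model))  # verify sparsity actually increased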
@@ -75,7 +75,7 @@ To optimize GPU FLOPs, the code is similar to above:

.. code-block:: Python

-from hls4ml.optimization.objectives.gpu_objectives import GPUFLOPEstimator
+from hls4ml.optimization.dsp_aware_pruning.objectives.gpu_objectives import GPUFLOPEstimator

# Optimize model
# Note the change from ParameterEstimator to GPUFLOPEstimator
@@ -98,7 +98,7 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
.. code-block:: Python

from hls4ml.utils.config import config_from_keras_model
-from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator
+from hls4ml.optimization.dsp_aware_pruning.objectives.vivado_objectives import VivadoDSPEstimator

# Note the change from optimize_model to optimize_keras_model_for_hls4ml
# The function optimize_keras_model_for_hls4ml acts as a wrapper around optimize_model, parsing the hls4ml config into model attributes
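
# --- Editor's sketch (not part of this diff): full wrapper call. The argument
# list is assumed to mirror optimize_model, with the hls4ml config replacing
# the manually extracted model attributes; the import path follows the new
# dsp_aware_pruning module layout introduced in this PR.
# from hls4ml.optimization.dsp_aware_pruning import optimize_keras_model_for_hls4ml
hls_config = config_from_keras_model(baseline_model)
optimized_model = optimize_keras_model_for_hls4ml(
    baseline_model, hls_config, VivadoDSPEstimator, scheduler,
    X_train, y_train, X_val, y_val,
    batch_size=64, epochs=10,
    optimizer=Adam(), loss_fn=CategoricalCrossentropy(from_logits=True),
    validation_metric=CategoricalAccuracy(), increasing=True, rtol=0.975,
)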
@@ -130,5 +130,5 @@ Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used
.. code-block:: Python

hls_config = config_from_keras_model(optimized_model)
-hls_config['Model']['DenseResourceImplementation'] = 'Unrolled'
-# Any addition hls4ml config, such as strategy, reuse factor etc...
+hls_config['Model']['Strategy'] = 'Unrolled'
+# Any additional hls4ml config, reuse factor etc...
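
# --- Editor's sketch (not part of this diff): converting the pruned model
# with this config; output_dir and backend are placeholders.
import hls4ml

hls_model = hls4ml.converters.convert_from_keras_model(
    optimized_model, hls_config=hls_config, output_dir='hls_prj', backend='Vitis'
)
hls_model.compile()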
5 changes: 4 additions & 1 deletion docs/api/configuration.rst
@@ -135,7 +135,10 @@ For Vivado backend the options are:
* **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here <https://docs.xilinx.com/r/en-US/ug1399-vitis-hls/pragma-HLS-stream>`__.
* **HLSConfig**\: the detailed configuration of precision and parallelism, including:
* **ReuseFactor**\ : in the case that you are pipelining, this defines the pipeline interval or initiation interval
-* **Strategy**\ : Optimization strategy on FPGA, either "Latency" or "Resource". If none is supplied then hl4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" strategy. An example of using larger reuse factor can be found `here. <https://github.com/fastmachinelearning/models/tree/master/keras/KERAS_dense>`__
* **ParallelizationFactor**\ : The number of output "pixels" to compute in parallel in convolutional layers. Increasing this parameter results in a significant increase in the resources required on the FPGA.
+* **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hls4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using the "resource" or "unrolled" strategy. An example of using a larger reuse factor can be found `here <https://github.com/fastmachinelearning/models/tree/master/keras/KERAS_dense>`__.
+* **PipelineStyle**\ : Set the top-level pipeline style. Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto". A configuration sketch follows this list.
+* **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with the "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network.
* **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed<X,Y>``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits.
Additionally, integers in fixed precision data type (\ ``ap_int<N>``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. You have a chance to further configure this more finely with per-layer configuration described below.

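For reference, a sketch of these options set through a Python config dict (values are illustrative; only the keys are taken from the documentation above):

.. code-block:: Python

    from hls4ml.utils.config import config_from_keras_model

    hls_config = config_from_keras_model(model, granularity='model')
    hls_config['Model']['ReuseFactor'] = 4
    hls_config['Model']['Strategy'] = 'Resource'    # 'Latency', 'Resource' or 'Unrolled'
    hls_config['Model']['PipelineStyle'] = 'auto'   # 'auto', 'pipeline' or 'dataflow'
    # hls_config['Model']['PipelineInterval'] = 4   # only with the 'pipeline' style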
4 changes: 3 additions & 1 deletion hls4ml/backends/fpga/fpga_backend.py
@@ -238,10 +238,12 @@ def get_closest_reuse_factor(self, valid_rf, chosen_rf):
        else:
            return before

-    def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor'):
+    def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor', include_max_rf=True):
        assert attribute is not None, 'Reuse factor attribute cannot be None'

        valid_rf = self.get_valid_reuse_factors(n_in, n_out)
+        if not include_max_rf:
+            valid_rf.pop()
        chosen_rf = layer.get_attr(attribute)
        if chosen_rf not in valid_rf:
            closest_rf = self.get_closest_reuse_factor(valid_rf, chosen_rf)
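
# --- Editor's sketch (not part of this diff): usage of the new flag. This
# assumes get_valid_reuse_factors returns factors sorted ascending, so
# valid_rf.pop() drops the largest option (rf == n_in * n_out, fully serial).
backend.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
reuse_factor = layer.get_attr('reuse_factor')  # snapped to a valid, non-maximal value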
23 changes: 20 additions & 3 deletions hls4ml/backends/vitis/passes/feature_check.py
@@ -14,7 +14,7 @@ def transform(self, model, node):
        node.set_attr('implementation', 'linebuffer')


-class ValidateStrategy(OptimizerPass):
+class ValidateResourceStrategy(OptimizerPass):
    _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense']

    def match(self, node):
@@ -29,6 +29,23 @@ def transform(self, model, node):
        if rf > n_in and rf % n_in > 0:
            print(
                f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis '
-                'backend due to use of "urem" cores.\n'
-                'Consider using a different ReuseFactor or switching to "Latency" strategy.'
+                'backend due to use of "urem" cores in Vitis HLS <= 2022.1.\n'
+                'Consider using a different ReuseFactor or switching to "Latency" strategy if using older versions '
+                'of Vitis HLS.'
            )


+class ValidateResourceUnrolledStrategy(OptimizerPass):
+    _unrolled_layer_cls = ['Conv1D', 'Conv2D', 'Dense', 'GRU', 'LSTM']
+
+    def match(self, node):
+        is_unrolled_layer = len([layer_cls for layer_cls in self._unrolled_layer_cls if layer_cls in node.class_name]) > 0
+        is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'resource_unrolled'
+
+        return is_unrolled_layer and is_unrolled_strategy
+
+    def transform(self, model, node):
+        print(
+            f'WARNING: "ResourceUnrolled" strategy in "{node.name}" ({node.class_name}) may have unexpected II in '
+            'Vitis backend.\nVerify that the final design satisfies the latency/II constraints.'
+        )
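
# --- Editor's note (illustrative): the comprehension-length test in match()
# is equivalent to the shorter form
#     is_unrolled_layer = any(cls in node.class_name for cls in self._unrolled_layer_cls)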
3 changes: 2 additions & 1 deletion hls4ml/backends/vitis/vitis_backend.py
@@ -15,7 +15,8 @@ def __init__(self):
    def _register_flows(self):
        validation_passes = [
            'vitis:validate_conv_implementation',
-            'vitis:validate_strategy',
+            'vitis:validate_resource_strategy',
+            'vitis:validate_resource_unrolled_strategy',
        ]
        validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name)

44 changes: 44 additions & 0 deletions hls4ml/backends/vivado/passes/convolution_templates.py
@@ -22,6 +22,8 @@
    typedef {accum_t.name} accum_t;
    typedef {bias_t.name} bias_t;
    typedef {weight_t.name} weight_t;
+    template<class data_T, class res_T, class CONFIG_T>
+    using kernel = nnet::{dense_function}<data_T, res_T, CONFIG_T>;
    template<class x_T, class y_T>
    using product = nnet::product::{product_type}<x_T, y_T>;
}};\n"""
@@ -100,6 +102,18 @@ def format(self, node):
        mult_params['product_type'] = get_backend('vivado').product_type(
            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
        )

+        if node.get_attr('strategy').lower() == 'latency':
+            mult_params['dense_function'] = 'DenseLatency'
+        elif node.get_attr('strategy').lower() == 'resource':
+            if int(mult_params['reuse_factor']) <= int(mult_params['n_in']):
+                mult_params['dense_function'] = 'DenseResource_rf_leq_nin'
+            else:
+                mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
+            # The 3rd case is never used
+        elif node.get_attr('strategy').lower() == 'resource_unrolled':
+            mult_params['dense_function'] = f'dense_resource_unrolled_{node.index}'
+
        mult_config = self.mult_template.format(**mult_params)

        return mult_config + '\n' + conv_config
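
# --- Editor's sketch (not part of this diff): the kernel selection above, and
# its twin for Dense in core_templates.py below, reads as a small pure function.
# The function name and the ValueError fallback are illustrative; the third
# Resource variant (rf > n_in with a remainder) is never emitted because reuse
# factors are snapped to valid values upstream (see set_closest_reuse_factor).
def pick_dense_function(strategy: str, reuse_factor: int, n_in: int, index: int) -> str:
    strategy = strategy.lower()
    if strategy == 'latency':
        return 'DenseLatency'
    if strategy == 'resource':
        if reuse_factor <= n_in:
            return 'DenseResource_rf_leq_nin'
        return 'DenseResource_rf_gt_nin_rem0'  # rf > n_in and rf % n_in == 0
    if strategy == 'resource_unrolled':
        return f'dense_resource_unrolled_{index}'
    raise ValueError(f'Unsupported strategy: {strategy}')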
@@ -213,6 +227,18 @@ def format(self, node):
        mult_params['product_type'] = get_backend('vivado').product_type(
            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
        )

+        if node.get_attr('strategy').lower() == 'latency':
+            mult_params['dense_function'] = 'DenseLatency'
+        elif node.get_attr('strategy').lower() == 'resource':
+            if int(mult_params['reuse_factor']) <= int(mult_params['n_in']):
+                mult_params['dense_function'] = 'DenseResource_rf_leq_nin'
+            else:
+                mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
+            # The 3rd case is never used
+        elif node.get_attr('strategy').lower() == 'resource_unrolled':
+            mult_params['dense_function'] = f'dense_resource_unrolled_{node.index}'
+
        mult_config = self.mult_template.format(**mult_params)

        return mult_config + '\n' + conv_config
@@ -297,6 +323,8 @@ def format(self, node):
            params['scale_index_type'] = 'scale_index_regular'

        params['config_t'] = f'config{node.index}_depthwise_mult'
+        # TODO - Extend unrolled Dense Resource
+        params['unrolled_function'] = 'DenseResourceUnrolled'
        depthwise_config = self.depthwise_template.format(**params)

        # Depthwise mult config
@@ -309,6 +337,9 @@ def format(self, node):
        mult_params['product_type'] = get_backend('vivado').product_type(
            node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision
        )
+        # TODO - Extend unrolled Dense Resource to depthwise Conv1D
+        mult_params['unrolled_function'] = 'DenseResourceUnrolled'
+
        depthwise_mult_config = self.depthwise_mult_template.format(**mult_params)

        # Pointwise config
@@ -338,6 +369,8 @@ def format(self, node):
            params['scale_index_type'] = 'scale_index_regular'

        params['config_t'] = f'config{node.index}_pointwise_mult'
+        # TODO - Extend unrolled Dense Resource
+        params['unrolled_function'] = 'DenseResourceUnrolled'
        pointwise_config = self.pointwise_template.format(**params)

        # Pointwise mult config
@@ -350,6 +383,9 @@ def format(self, node):
        mult_params['product_type'] = get_backend('vivado').product_type(
            node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision
        )
+        # TODO - Extend unrolled Dense Resource to separable Conv1D
+        mult_params['unrolled_function'] = 'DenseResourceUnrolled'
+
        pointwise_mult_config = self.pointwise_mult_template.format(**mult_params)

        return (
@@ -425,6 +461,8 @@ def format(self, node):
            params['scale_index_width_type'] = 'scale_index_regular'

        params['config_t'] = f'config{node.index}_depthwise_mult'
+        # TODO - Extend unrolled Dense Resource
+        params['unrolled_function'] = 'DenseResourceUnrolled'
        depthwise_config = self.depthwise_template.format(**params)

        # Depthwise mult config
@@ -437,6 +475,8 @@ def format(self, node):
        mult_params['product_type'] = get_backend('vivado').product_type(
            node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision
        )
+        # TODO - Extend unrolled Dense Resource to depthwise Conv2D
+        mult_params['unrolled_function'] = 'DenseResourceUnrolled'
        depthwise_mult_config = self.depthwise_mult_template.format(**mult_params)

        # Pointwise config
@@ -474,6 +514,8 @@ def format(self, node):
        else:
            params['scale_index_width_type'] = 'scale_index_regular'
        params['config_t'] = f'config{node.index}_pointwise_mult'
+        # TODO - Extend unrolled Dense Resource
+        params['unrolled_function'] = 'DenseResourceUnrolled'
        pointwise_config = self.pointwise_template.format(**params)

        # Pointwise mult config
@@ -486,6 +528,8 @@ def format(self, node):
        mult_params['product_type'] = get_backend('vivado').product_type(
            node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision
        )
+        # TODO - Extend unrolled Dense Resource to separable Conv2D
+        mult_params['unrolled_function'] = 'DenseResourceUnrolled'
        pointwise_mult_config = self.pointwise_mult_template.format(**mult_params)

        return (
13 changes: 13 additions & 0 deletions hls4ml/backends/vivado/passes/core_templates.py
@@ -19,6 +19,8 @@
    typedef {bias_t.name} bias_t;
    typedef {weight_t.name} weight_t;
    typedef {index_t.name} index_t;
+    template<class data_T, class res_T, class CONFIG_T>
+    using kernel = nnet::{dense_function}<data_T, res_T, CONFIG_T>;
    template<class x_T, class y_T>
    using product = nnet::product::{product_type}<x_T, y_T>;
}};\n"""
@@ -41,6 +43,17 @@ def format(self, node):
            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
        )

+        if node.get_attr('strategy').lower() == 'latency':
+            params['dense_function'] = 'DenseLatency'
+        elif node.get_attr('strategy').lower() == 'resource':
+            if int(params['reuse_factor']) <= int(params['n_in']):
+                params['dense_function'] = 'DenseResource_rf_leq_nin'
+            else:
+                params['dense_function'] = 'DenseResource_rf_gt_nin_rem0'
+            # The 3rd case is never used
+        elif node.get_attr('strategy').lower() == 'resource_unrolled':
+            params['dense_function'] = f'dense_resource_unrolled_{node.index}'

        return self.template.format(**params)

