Skip to content

Commit

Permalink
Merge branch 'main' of github.com:pulp-platform/pulp-nnx into fconti/…
Browse files Browse the repository at this point in the history
…neureka
FrancescoConti committed Feb 22, 2024
2 parents bbd4b7a + b4d7cd4 commit 99cc182
Showing 15 changed files with 216 additions and 102 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -8,12 +8,14 @@
- Support for kernels without normalization and quantization for NE16
- isort check
- publication citation
- Support for 32-bit scale

### Changed

- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
- `ne16_task_set_ptrs` split into `ne16_task_set_ptrs_conv` and `ne16_task_set_ptrs_norm_quant`

### Removed

2 changes: 1 addition & 1 deletion ne16/README.md
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@
- [ ] Scale type
- [x] uint8
- [ ] uint16
- [ ] uint32
- [x] uint32
- [x] Bias type
- [x] int32
- [ ] Weight type
25 changes: 14 additions & 11 deletions ne16/hal/ne16_task.c
Original file line number Diff line number Diff line change
@@ -113,15 +113,18 @@ uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
return ptr - (padding_top * width + padding_left) * width_stride;
}

void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
uint32_t w_in_stride, uint8_t padding_top,
uint8_t padding_left, uint32_t output_ptr,
uint32_t weights_ptr, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr) {
task->data.infeat_ptr =
ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
}

void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.scale_ptr = scale_ptr;
task->data.scale_shift_ptr = shift_ptr;
task->data.scale_bias_ptr = bias_ptr;
@@ -206,8 +209,8 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
}

void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left) {
const uint8_t bottom, const uint8_t left,
const uint8_t right) {
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
}
@@ -219,8 +222,8 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
const uint32_t h_out_stride,
const uint32_t w_out_stride, const uint8_t padding_top,
const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left) {
const uint8_t padding_left,
const uint8_t padding_right) {
ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
w_out_stride);
ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
@@ -235,8 +238,8 @@ void ne16_task_set_dims_stride2x2(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left) {
const uint8_t padding_bottom, const uint8_t padding_left,
const uint8_t padding_right) {
const uint8_t stride = 2;

// WARNING: works only for even output channel stride (divisible by 2)
31 changes: 16 additions & 15 deletions ne16/hal/ne16_task.h
Original file line number Diff line number Diff line change
@@ -42,8 +42,8 @@ typedef enum {

typedef struct ne16_norm_t {
ne16_norm_mode_e mode;
int flag_bias;
int flag_shift;
ne16_task_flag_e flag_bias;
ne16_task_flag_e flag_shift;
} ne16_norm_t;

typedef enum ne16_quant_mode_e {
@@ -59,9 +59,9 @@ typedef enum ne16_quant_function_e {

typedef struct ne16_quant_t {
// Shift amount must be in range 0x00-0x1F
unsigned shift_amount;
uint8_t shift_amount;
ne16_quant_function_e function;
int flag_rounding;
ne16_task_flag_e flag_rounding;
} ne16_quant_t;

typedef struct ne16_stride_t {
@@ -133,11 +133,12 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
uint32_t w_in_stride, uint8_t padding_top,
uint8_t padding_left, uint32_t output_ptr,
uint32_t weights_ptr, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr);
void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
/** ne16_task_set_strides
*
* All the strides variables are strides between elements alongside that
@@ -157,8 +158,8 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
const uint8_t bottom, const uint8_t left,
const uint8_t right, const uint8_t value);
void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left);
const uint8_t bottom, const uint8_t left,
const uint8_t right);
/** ne16_task_set_dims
*
* All the strides variables are strides between elements alongside that
@@ -172,8 +173,8 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
const uint32_t h_out_stride,
const uint32_t w_out_stride, const uint8_t padding_top,
const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left);
const uint8_t padding_left,
const uint8_t padding_right);
/** ne16_task_set_dims_stride2x2
*
* All the strides variables are strides between elements alongside that
@@ -186,7 +187,7 @@ void ne16_task_set_dims_stride2x2(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left);
const uint8_t padding_bottom, const uint8_t padding_left,
const uint8_t padding_right);

#endif // !__NE16_TASK_H__
5 changes: 2 additions & 3 deletions neureka/README.md
Original file line number Diff line number Diff line change
@@ -16,17 +16,16 @@ Github repo [link](https://github.com/siracusa-soc/ne).
- [x] Bias (w/ and w/o)
- [ ] Per-channel shift
- [x] Per-layer shift
- [ ] Rounding
- [x] Input type
- [x] uint8
- [x] int8
- [x] Output type
- [x] int8
- [x] uint8 (only w/ Relu)
- [x] int32
- [ ] Scale type
- [x] Scale type
- [x] uint8
- [ ] uint32
- [x] uint32
- [x] Bias type
- [x] int32
- [ ] Weight type
6 changes: 3 additions & 3 deletions neureka/bsp/siracusa/neureka_siracusa_bsp.h
Original file line number Diff line number Diff line change
@@ -18,8 +18,8 @@
* SPDX-License-Identifier: Apache-2.0
*/

#ifndef __NEUREKA_siracusa_BSP_H__
#define __NEUREKA_siracusa_BSP_H__
#ifndef __NEUREKA_SIRACUSA_BSP_H__
#define __NEUREKA_SIRACUSA_BSP_H__

#include "neureka.h"
#include <stdint.h>
@@ -64,4 +64,4 @@ void neureka_siracusa_close();
void neureka_siracusa_event_wait_and_clear();
const neureka_dev_t *neureka_siracusa_get_dev();

#endif // !__NEUREKA_siracusa_BSP_H__
#endif // !__NEUREKA_SIRACUSA_BSP_H__
23 changes: 12 additions & 11 deletions neureka/hal/neureka_task.c
Original file line number Diff line number Diff line change
@@ -47,8 +47,7 @@ void neureka_task_init(neureka_task_t *task) {

void neureka_task_set_op_to_conv(neureka_task_t *task,
const uint8_t kernel_shape,
const uint8_t depthwise,
const uint8_t stride) {
const uint8_t depthwise) {
task->depthwise = depthwise;
task->kernel_shape = kernel_shape;
task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
@@ -133,16 +132,18 @@ uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
return ptr - (padding_top * width + padding_left) * width_stride;
}

void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr,
uint32_t scale_ptr, uint32_t shift_ptr,
uint32_t bias_ptr) {
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr) {
task->data.infeat_ptr =
neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
}

void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.scale_ptr = scale_ptr;
task->data.scale_shift_ptr = shift_ptr;
task->data.scale_bias_ptr = bias_ptr;
@@ -223,8 +224,8 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
}

void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left) {
const uint8_t bottom, const uint8_t left,
const uint8_t right) {
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
}
@@ -235,7 +236,7 @@ void neureka_task_set_dims(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t padding_top, const uint8_t padding_bottom,
const uint8_t padding_right, const uint8_t padding_left) {
const uint8_t padding_left, const uint8_t padding_right) {
neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
w_out_stride);
neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
28 changes: 14 additions & 14 deletions neureka/hal/neureka_task.h
Original file line number Diff line number Diff line change
@@ -51,8 +51,8 @@ typedef enum {

typedef struct neureka_norm_t {
neureka_norm_mode_e mode;
int flag_bias;
int flag_shift;
neureka_task_flag_e flag_bias;
neureka_task_flag_e flag_shift;
} neureka_norm_t;

typedef enum neureka_quant_mode_e {
@@ -67,9 +67,9 @@ typedef enum neureka_quant_function_e {

typedef struct neureka_quant_t {
// Shift amount must be in range 0x00-0x1F
unsigned shift_amount;
uint8_t shift_amount;
neureka_quant_function_e function;
int flag_rounding;
neureka_task_flag_e flag_rounding;
} neureka_quant_t;

typedef struct neureka_stride_t {
@@ -128,7 +128,7 @@ typedef struct neureka_task_t {
void neureka_task_init(neureka_task_t *task);
void neureka_task_set_op_to_conv(neureka_task_t *task,
const uint8_t kernel_shape,
const uint8_t depthwise, const uint8_t stride);
const uint8_t depthwise);
void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
const uint8_t output_bits,
const uint8_t weight_bits);
@@ -147,12 +147,12 @@ uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr,
uint32_t scale_ptr, uint32_t shift_ptr,
uint32_t bias_ptr);
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr);
void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
/** neureka_task_set_strides
*
* All the strides variables are strides between elements alongside that
@@ -173,8 +173,8 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
const uint8_t bottom, const uint8_t left,
const uint8_t right, const uint8_t value);
void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left);
const uint8_t bottom, const uint8_t left,
const uint8_t right);
/** neureka_task_set_dims
*
* All the strides variables are strides between elements alongside that
@@ -187,6 +187,6 @@ void neureka_task_set_dims(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t padding_top, const uint8_t padding_bottom,
const uint8_t padding_right, const uint8_t padding_left);
const uint8_t padding_left, const uint8_t padding_right);

#endif // !__NEUREKA_TASK_H__
17 changes: 14 additions & 3 deletions test/NeuralEngineFunctionalModel.py
Original file line number Diff line number Diff line change
@@ -28,24 +28,34 @@ def _norm_quant(
bias_type: Optional[IntegerType],
has_bias: bool,
has_relu: bool,
verbose: bool,
) -> torch.Tensor:
# Scale accumulators are in 48bit, so keeping the data in 64bit
tensor = tensor * scale
assert tensor.dtype == torch.int64

if verbose:
print("INTERMEDIATE RESULTS (after scale):")
print(tensor)

if has_bias:
assert bias is not None
assert bias_type is not None
# Saturating cast to int32

tensor = NeuralEngineFunctionalModel._cast(
tensor, bias_type, saturate=True
tensor, bias_type, saturate=False
).type(torch.int32)

tensor = tensor + bias

tensor = NeuralEngineFunctionalModel._cast(
tensor, bias_type, saturate=False
tensor, bias_type, saturate=True
).type(torch.int32)

if verbose:
print("INTERMEDIATE RESULTS (after bias):")
print(tensor)

if has_relu:
tensor = F.relu(tensor)

@@ -118,6 +128,7 @@ def convolution(
bias_type,
has_bias,
has_relu,
verbose,
)

return output
2 changes: 0 additions & 2 deletions test/NeurekaMemoryLayout.py
Original file line number Diff line number Diff line change
@@ -20,8 +20,6 @@
import numpy as np
import numpy.typing as npt

from TestClasses import IntegerType


class NeurekaMemoryLayout:
_WEIGHT_BANDWIDTH = 256
12 changes: 9 additions & 3 deletions test/NnxTestClasses.py
Original file line number Diff line number Diff line change
@@ -254,16 +254,22 @@ def from_conf(
).type(torch.int32)
if global_shift is None:
global_shift = torch.Tensor([0]).type(torch.int32)
conv_kwargs = {
**conf.__dict__,
"out_type": NeuralEngineFunctionalModel.ACCUMULATOR_TYPE,
}
output = NeuralEngineFunctionalModel().convolution(
input,
weight,
scale,
bias,
global_shift,
verbose=verbose,
**conf.__dict__,
verbose=False,
**conv_kwargs,
)
global_shift = NnxTestGenerator._calculate_global_shift(
output, conf.out_type
)
NnxTestGenerator._calculate_global_shift(output, conf.out_type)

output = NeuralEngineFunctionalModel().convolution(
input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__
Loading

0 comments on commit 99cc182

Please sign in to comment.