From a6f142a9fbe4b0c65628d1f984c9dabe64dc3d5c Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sat, 27 Jan 2024 15:14:19 +0100 Subject: [PATCH] Add no norm_quant to neureka and all the fixes too --- neureka/hal/neureka_task.c | 178 +++++++++++++++++++------------- neureka/hal/neureka_task.h | 58 +++++++---- neureka/hal/neureka_task_defs.h | 48 +++++---- test/NeurekaTestConf.py | 2 +- test/app/src/nnx_layer.c | 37 +++++-- 5 files changed, 206 insertions(+), 117 deletions(-) diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 2c8823c..35d0745 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -23,8 +23,8 @@ #include "pulp_nnx_util.h" uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width) { + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { uint32_t tile_padding = padding; if (i_height > 0) { tile_padding &= ~(0xf << 28); @@ -41,40 +41,78 @@ uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const neureka_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, - neureka_quant_t quant, neureka_norm_t norm, - const uint8_t flag_input_signed) { - *task = (neureka_task_t){.outbytes = output_bits / 8, - .qw = weights_bits, - .output_channel_throughput = - depthwise ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, - .input_channel_throughput = - kernel_shape == 3 - ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1, - .kernel_shape = kernel_shape, - .depthwise = depthwise, - .data = {0}}; +void neureka_task_init(neureka_task_t *task) { + *task = (neureka_task_t){.data = {0}}; +} + +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, + const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_OUTPUT_CHANNEL; + task->subtile_input_channel = kernel_shape == 3 + ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1; const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 : depthwise == 1 ? NEUREKA_FLAG_MODE_3x3_DW : NEUREKA_FLAG_MODE_3x3; - task->data.cfg.conf0 |= - flag_input_signed << NEUREKA_SHIFT_FLAG_INPUT_SIGNED | - NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | norm.mode | - norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | NEUREKA_FLAG_USE_TCDM | - weights_offset_mode | flag_mode | (weights_bits - 1); + task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE); + task->data.cfg.conf0 |= flag_mode; +} + +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits) { + neureka_quant_mode_e quantMode; + if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->qw = weight_bits; + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | (weight_bits - 1); +} + +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm) { + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT | + NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS | + NEUREKA_MASK_FLAG_NORM_SHIFT); + task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function | + (quant.shift_amount << 16) | norm.mode | + norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT; +} + +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; +} + +void neureka_task_set_input_signed(neureka_task_t *task) { + task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_input_unsigned(neureka_task_t *task) { + task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED; +} - task->data.cfg.weight_offset_factor = weights_offset_factor; +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE; + task->data.cfg.conf0 |= weight_source; } /** neureka_pad_ptr @@ -84,18 +122,18 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, * Necessary for input pointer when it's padded. */ uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, + const uint8_t padding_left) { return ptr - (padding_top * width + padding_left) * channel * bits / 8; } void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { + uint32_t w_in, uint32_t k_in, uint8_t bits_in, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr) { task->data.infeat_ptr = neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); task->data.outfeat_ptr = output_ptr; @@ -106,31 +144,28 @@ void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, } void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride) { - const uint32_t num_k_in = divnceil(k_in, task->input_channel_throughput); + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = divnceil(k_in, task->subtile_input_channel); const neureka_stride_t input_stride = { - .d0 = k_in_stride, - .d1 = k_in_stride * w_in_stride, - .d2 = 0 // Unused - }; + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - const neureka_stride_t output_stride = { - .d0 = 32, // TODO: should depend on outbytes. Probably 32 / outbytes - .d1 = k_out_stride * task->outbytes, - .d2 = k_out_stride * task->outbytes * w_out_stride}; + const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; task->data.cfg.output_stride = output_stride; - task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_D0_STRIDE; + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES; if (task->kernel_shape == 1) { // 1x1 - task->data.cfg.weights_stride.d1 = NEUREKA_WEIGHT_D0_STRIDE * num_k_in; + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in; } else if (!task->depthwise) { // 3x3 task->data.cfg.weights_stride.d1 = - NEUREKA_WEIGHT_D0_STRIDE * task->qw * num_k_in; + NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in; } else { // 3x3 depthwise task->data.cfg.weights_stride.d1 = 0; } @@ -142,15 +177,15 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint32_t k_out, const uint8_t padding_bottom, const uint8_t padding_right) { - const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); - const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput); - const uint16_t num_Ho = divnceil(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT); - const uint16_t num_Wo = divnceil(w_out, NEUREKA_COMPUTE_SIZE_WIDTH); - - const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput); - const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput); - const uint16_t rem_Ho = remainder(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT); - const uint16_t rem_Wo = remainder(w_out, NEUREKA_COMPUTE_SIZE_WIDTH); + const uint16_t num_Ko = divnceil(k_out, task->subtile_output_channel); + const uint16_t num_Ki = divnceil(k_in, task->subtile_input_channel); + const uint16_t num_Ho = divnceil(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = divnceil(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = remainder(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = remainder(k_in, task->subtile_input_channel); + const uint16_t rem_Ho = remainder(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = remainder(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); const uint16_t rem_Hi = rem_Ho == 0 ? 0 : (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - @@ -170,30 +205,29 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, } void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value) { + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | (value & 0xff); } -void neureka_task_set_mask_filter(neureka_task_t *task, - const uint8_t top, const uint8_t right, - const uint8_t bottom, - const uint8_t left) { +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } void neureka_task_set_dims( neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { - neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, padding_right); neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h index 64356e6..a265223 100644 --- a/neureka/hal/neureka_task.h +++ b/neureka/hal/neureka_task.h @@ -29,6 +29,11 @@ typedef enum neureka_task_flag_e { neurekaTaskFlagTrue = 1 } neureka_task_flag_e; +typedef enum neureka_weight_source_e { + neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM, + neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM +} neureka_weight_source_e; + typedef enum neureka_weight_offset_mode_e { weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE @@ -36,7 +41,6 @@ typedef enum neureka_weight_offset_mode_e { typedef enum { normMode8Bit = NEUREKA_NORM_MODE_8BIT, - normMode16Bit = NEUREKA_NORM_MODE_16BIT, normMode32Bit = NEUREKA_NORM_MODE_32BIT } neureka_norm_mode_e; @@ -48,7 +52,6 @@ typedef struct neureka_norm_t { typedef enum neureka_quant_mode_e { quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, - quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, quantMode32Bit = NEUREKA_QUANT_MODE_32BIT } neureka_quant_mode_e; @@ -60,7 +63,6 @@ typedef enum neureka_quant_function_e { typedef struct neureka_quant_t { // Shift amount must be in range 0x00-0x1F unsigned shift_amount; - neureka_quant_mode_e mode; neureka_quant_function_e function; int flag_rounding; } neureka_quant_t; @@ -110,22 +112,30 @@ typedef struct neureka_task_data_t { typedef struct neureka_task_t { neureka_task_data_t data; - uint8_t outbytes; uint8_t qw; - uint8_t output_channel_throughput; - uint8_t input_channel_throughput; + uint8_t subtile_output_channel; + uint8_t subtile_input_channel; uint8_t kernel_shape; uint8_t depthwise; uint8_t id; } neureka_task_t; -void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const neureka_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, - neureka_quant_t quant, neureka_norm_t norm, - const uint8_t flag_input_signed); +void neureka_task_init(neureka_task_t *task); +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits); +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm); +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); +void neureka_task_set_input_signed(neureka_task_t *task); +void neureka_task_set_input_unsigned(neureka_task_t *task); +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source); uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); @@ -138,11 +148,17 @@ void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t output_ptr, uint32_t weights_ptr, uint32_t scale_ptr, uint32_t shift_ptr, uint32_t bias_ptr); +/** neureka_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); + const uint32_t h_out_stride, + const uint32_t w_out_stride); void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, @@ -154,11 +170,17 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left); +/** neureka_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ void neureka_task_set_dims( neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index 7ed77eb..fa08289 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -23,13 +23,20 @@ /* ARHITECTURE */ -#define NEUREKA_COMPUTE_SIZE_HEIGHT (6) -#define NEUREKA_COMPUTE_SIZE_WIDTH (6) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1 (32) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) -#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_WEIGHT_BANDWIDTH (256) -#define NEUREKA_WEIGHT_D0_STRIDE (NEUREKA_WEIGHT_BANDWIDTH / 8) +#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32) + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28) + +#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6) +#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6) +#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32) + +#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32) /* TASK REGISTERS */ @@ -65,7 +72,6 @@ #define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) #define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) #define NEUREKA_SHIFT_QUANT_SHIFT (16) -#define NEUREKA_SHIFT_ROUNDING (11) /* CONF0 FLAGS */ @@ -75,7 +81,6 @@ #define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) #define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) #define NEUREKA_QUANT_MODE_8BIT (0 << 21) -#define NEUREKA_QUANT_MODE_16BIT (1 << 21) // not supported #define NEUREKA_QUANT_MODE_32BIT (2 << 21) // conf0[20:16] - quantization shift amount #define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc @@ -83,25 +88,30 @@ (1 << 15) // Unimplemented in gvsoc #define NEUREKA_FLAG_STREAMIN (1 << 14) #define NEUREKA_NORM_MODE_8BIT (0 << 12) -#define NEUREKA_NORM_MODE_16BIT (1 << 12) // not supported #define NEUREKA_NORM_MODE_32BIT (2 << 12) -#define NEUREKA_FLAG_ROUND (1 << 11) // not supported #define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) -#define NEUREKA_FLAG_USE_WMEM (1 << 9) -#define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // not supported -#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not supported +#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9) +#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9) +#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested #define NEUREKA_FLAG_MODE_3x3 (0 << 5) #define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) #define NEUREKA_FLAG_MODE_1x1 (2 << 5) #define NEUREKA_FLAG_NORM_QUANT (1 << 4) -#define NEUREKA_FLAG_MODE_BASIC (0 << 3) -#define NEUREKA_FLAG_MODE16 (1 << 3) // not supported /* Masks */ -#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23) -#define NEUREKA_MASK_QUANT_MODE (3 << 21) +#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26) +#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23) +#define NEUREKA_MASK_QUANT_MODE (0x3 << 21) +#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NEUREKA_MASK_NORM_MODE (0x3 << 12) +#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10) +#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9) +#define NEUREKA_MASK_FLAG_MODE (0x3 << 5) +#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0) /* PADDING */ diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index d896a7e..f878e68 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -59,7 +59,7 @@ def check_valid_in_type(cls, v: IntegerType) -> IntegerType: @field_validator("out_type") @classmethod def check_valid_out_type(cls, v: IntegerType) -> IntegerType: - NeurekaTestConf._check_type("out_type", v, ["uint8", "int8"]) + NeurekaTestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) return v @field_validator("weight_type") diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 893f2fc..41317f6 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -29,6 +29,7 @@ #include "ne16_task.h" #include "pulp_nnx_ne16.h" +typedef ne16_norm_mode_e nnx_norm_mode_e; typedef ne16_quant_t nnx_quant_t; typedef ne16_norm_t nnx_norm_t; typedef ne16_task_t nnx_task_t; @@ -39,6 +40,10 @@ typedef ne16_pulp_conf_t nnx_bsp_conf_t; #define nnxTaskFlagFalse ne16TaskFlagFalse #define nnx_task_init ne16_task_init +#define nnx_task_set_op_to_conv ne16_task_set_op_to_conv +#define nnx_task_set_bits ne16_task_set_bits +#define nnx_task_set_norm_quant ne16_task_set_norm_quant +#define nnx_task_set_weight_offset ne16_task_set_weight_offset #define nnx_task_set_dims ne16_task_set_dims #define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 #define nnx_task_set_ptrs ne16_task_set_ptrs @@ -65,6 +70,7 @@ typedef ne16_pulp_conf_t nnx_bsp_conf_t; #include "neureka_task.h" #include "pulp_nnx_neureka.h" +typedef neureka_norm_mode_e nnx_norm_mode_e; typedef neureka_quant_t nnx_quant_t; typedef neureka_norm_t nnx_norm_t; typedef neureka_task_t nnx_task_t; @@ -75,6 +81,10 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #define nnxTaskFlagFalse neurekaTaskFlagFalse #define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset #define nnx_task_set_dims neureka_task_set_dims #define nnx_task_set_ptrs neureka_task_set_ptrs @@ -103,17 +113,17 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; static void task_prepare(nnx_task_t *task) { nnx_task_init(task); - ne16_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); - ne16_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); + nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); #if HAS_NORM_QUANT == 1 #if SCALE_BITS == 8 - const ne16_norm_mode_e normMode = normMode8Bit; + const nnx_norm_mode_e normMode = normMode8Bit; #elif SCALE_BITS == 32 - const ne16_norm_mode_e normMode = normMode32Bit; + const nnx_norm_mode_e normMode = normMode32Bit; #endif - ne16_task_set_norm_quant( + nnx_task_set_norm_quant( task, (nnx_quant_t){.shift_amount = OUTSHIFT, .function = @@ -123,8 +133,21 @@ static void task_prepare(nnx_task_t *task) { .flag_bias = HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse, .flag_shift = nnxTaskFlagFalse}); #endif // HAS_NORM_QUANT - // - ne16_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + + nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + +#ifdef NNX_NEUREKA +#ifdef NEUREKA_WEIGHT_SOURCE_WMEM + neureka_task_set_weight_source(task, neurekaWeightSourceWmem); +#else + neureka_task_set_weight_source(task, neurekaWeightSourceTcdm); +#endif +#if INPUT_SIGNED == 1 + neureka_task_set_input_signed(task); +#else + neureka_task_set_input_unsigned(task); +#endif +#endif const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride;