From 8cd796548004b9dcff1119b6a63f64f3a9091db0 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sun, 14 Jan 2024 19:37:51 +0100
Subject: [PATCH 01/72] Add neureka support similar to ne16

---
 inc/pulp_nnx_neureka.h                       |  77 ++++
 neureka/bsp/neureka_siracusa_bsp.c           |  92 ++++
 neureka/bsp/neureka_siracusa_bsp.h           |  81 ++++
 neureka/gvsoc/neureka_gvsoc.h                |  54 +++
 .../pulp_nnx_error_codes.h => hal/neureka.c} |  27 +-
 .../{src/pulp_nnx_util.c => hal/neureka.h}   |  22 +-
 neureka/hal/neureka_task.c                   | 234 ++++++++++
 neureka/hal/neureka_task.h                   | 173 ++++++++
 neureka/hal/neureka_task_defs.h              | 114 +++++
 neureka/inc/pulp_nnx_defs.h                  | 167 -------
 neureka/inc/pulp_nnx_hal.h                   | 217 ---------
 neureka/inc/pulp_nnx_util.h                  |  27 --
 neureka/src/pulp_nnx_hal.c                   | 412 ------------------
 src/pulp_nnx_neureka.c                       | 131 ++++++
 14 files changed, 987 insertions(+), 841 deletions(-)
 create mode 100644 inc/pulp_nnx_neureka.h
 create mode 100644 neureka/bsp/neureka_siracusa_bsp.c
 create mode 100644 neureka/bsp/neureka_siracusa_bsp.h
 create mode 100644 neureka/gvsoc/neureka_gvsoc.h
 rename neureka/{inc/pulp_nnx_error_codes.h => hal/neureka.c} (53%)
 rename neureka/{src/pulp_nnx_util.c => hal/neureka.h} (62%)
 create mode 100644 neureka/hal/neureka_task.c
 create mode 100644 neureka/hal/neureka_task.h
 create mode 100644 neureka/hal/neureka_task_defs.h
 delete mode 100644 neureka/inc/pulp_nnx_defs.h
 delete mode 100644 neureka/inc/pulp_nnx_hal.h
 delete mode 100644 neureka/inc/pulp_nnx_util.h
 delete mode 100644 neureka/src/pulp_nnx_hal.c
 create mode 100644 src/pulp_nnx_neureka.c

diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h
new file mode 100644
index 0000000..cabf30a
--- /dev/null
+++ b/inc/pulp_nnx_neureka.h
@@ -0,0 +1,77 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include <stdint.h>
+
+/* PULP-NNX interface */
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf);
+void neureka_nnx_term(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_check
+ *
+ * Check whether you can dispatch to the accelerator.
+ */
+int neureka_nnx_dispatch_check(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_wait
+ *
+ * Block until you can dispatch to the accelerator.
+ */
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch
+ *
+ * Dispatch a task to the accelerator.
+ * Returns 0 on success, or 1 if the task could not be dispatched.
+ */
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_check
+ *
+ * Check whether the task has been resolved.
+ */
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_wait
+ *
+ * Block until the task has been resolved.
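+ *
+ * A typical dispatch/resolve sequence (an illustrative sketch; device and
+ * task setup are omitted):
+ *
+ *   neureka_nnx_dispatch_wait(dev);
+ *   neureka_nnx_dispatch(dev, &task);
+ *   neureka_nnx_resolve_wait(dev, &task);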
+ */
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task);
+
+
+/* Additional helper functions */
+
+/** neureka_nnx_dispatch_stride2x2
+ *
+ * Uses Neureka's 2x2 strided mode, which reduces the number of writes Neureka
+ * does. This mode doesn't stride Neureka's subtile input pointer, so the tile
+ * itself has to be tiled to the subtile's spatial dimensions (in this case
+ * 3x3 output). Works only if k_out is divisible by 2.
+ */
+void neureka_nnx_dispatch_stride2x2(
+    neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+    const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker);
diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c
new file mode 100644
index 0000000..78ef09a
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.c
@@ -0,0 +1,92 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_siracusa_bsp.h"
+#include <pmsis.h>
+
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \
+  (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
+#define NEUREKA_SIRACUSA_MAX_STALL (8)
+#define NEUREKA_SIRACUSA_EVENT (1 << 12)
+#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000)
+
+void neureka_siracusa_cg_enable() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+void neureka_siracusa_cg_disable() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_setpriority_neureka() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_setpriority_core() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_reset_max_stall() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
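+
+/* Bit layout of the cluster-control HWPE register manipulated by the
+ * functions in this file, as implied by the masks defined above
+ * (an illustrative summary, not an authoritative register description):
+ *
+ *   bit  11   (0x800) - HWPE clock-gate enable     (MASK_CG_EN)
+ *   bit   8   (0x100) - HCI priority to the HWPE   (MASK_HCI_PRIO)
+ *   bits 7:0  (0xff)  - HCI max-stall cycle count  (MASK_HCI_MAXSTALL)
+ */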
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf) {
+  neureka_siracusa_cg_enable();
+  neureka_siracusa_hci_setpriority_neureka();
+  neureka_siracusa_hci_set_max_stall(conf->max_stall);
+}
+
+void neureka_siracusa_close() {
+  neureka_siracusa_hci_reset_max_stall();
+  neureka_siracusa_hci_setpriority_core();
+  neureka_siracusa_cg_disable();
+}
+
+void neureka_siracusa_event_wait_and_clear() {
+  eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT);
+}
+
+static const neureka_dev_t neureka_siracusa_dev = {
+    .hwpe_dev = (struct hwpe_dev_t){
+        .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}};
+
+const neureka_dev_t *neureka_siracusa_get_dev() { return &neureka_siracusa_dev; }
diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h
new file mode 100644
index 0000000..9e879e8
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.h
@@ -0,0 +1,81 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_siracusa_BSP_H__
+#define __NEUREKA_siracusa_BSP_H__
+
+#include "neureka.h"
+#include <stdint.h>
+
+/**
+ * neureka_siracusa_cg_enable
+ *
+ * Enable clock gating of the neureka.
+ */
+void neureka_siracusa_cg_enable();
+
+/**
+ * neureka_siracusa_cg_disable
+ *
+ * Disable clock gating of the neureka.
+ */
+void neureka_siracusa_cg_disable();
+
+/**
+ * neureka_siracusa_hci_setpriority_neureka
+ *
+ * Set HCI interconnect bus priority to prioritize neureka.
+ */
+void neureka_siracusa_hci_setpriority_neureka();
+
+/**
+ * neureka_siracusa_hci_setpriority_core
+ *
+ * Set HCI bus priority to prioritize cores.
+ */
+void neureka_siracusa_hci_setpriority_core();
+
+/**
+ * neureka_siracusa_hci_reset_max_stall
+ *
+ * Reset the HCI bus maxstall parameter.
+ * TODO: Check if it disables it also or just resets?
+ */
+void neureka_siracusa_hci_reset_max_stall();
+
+/**
+ * neureka_siracusa_hci_set_max_stall
+ *
+ * Set the HCI bus maxstall. Maxstall defines for how many cycles the
+ * HCI bus will stall the lower-priority master, i.e. neureka or core,
+ * before letting it do a transaction.
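+ *
+ * For example (an illustrative sketch; the value matches the
+ * NEUREKA_SIRACUSA_MAX_STALL constant defined in the BSP):
+ *
+ *   neureka_siracusa_conf_t conf = {.max_stall = 8};
+ *   neureka_siracusa_open(&conf); // calls neureka_siracusa_hci_set_max_stall(8)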
+ */ +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall); + +typedef struct neureka_siracusa_conf_t { + int max_stall; +} neureka_siracusa_conf_t; + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf); +void neureka_siracusa_close(); +void neureka_siracusa_event_wait_and_clear(); +const neureka_dev_t *neureka_siracusa_get_dev(); + +#endif // !__NEUREKA_siracusa_BSP_H__ diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h new file mode 100644 index 0000000..37eeab0 --- /dev/null +++ b/neureka/gvsoc/neureka_gvsoc.h @@ -0,0 +1,54 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_GVSOC_H__ +#define __NEUREKA_GVSOC_H__ + +#include "neureka.h" +#include "neureka_task.h" + +#define NEUREKA_REG_GVSOC_LOG_LEVEL 24 +#define NEUREKA_REG_GVSOC_LOG_FORMAT 25 + +typedef enum neureka_gvsoc_log_format_e { + NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0, + NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3 +} neureka_gvsoc_log_format_e; + +typedef enum neureka_gvsoc_log_level_e { + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0, + NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1, + NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2, + NEUREKA_GVSOC_LOG_LEVEL_ALL = 3 +} neureka_gvsoc_log_level_e; + +static void neureka_gvsoc_log_activate(neureka_dev_t *dev, + neureka_gvsoc_log_level_e log_level, + neureka_gvsoc_log_format_e format) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level); + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format); +} + +static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END); +} + +#endif // __NEUREKA_GVSOC_H__ diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c similarity index 53% rename from neureka/inc/pulp_nnx_error_codes.h rename to neureka/hal/neureka.c index dc71575..ebcad93 100644 --- a/neureka/inc/pulp_nnx_error_codes.h +++ b/neureka/hal/neureka.c @@ -18,15 +18,22 @@ * SPDX-License-Identifier: Apache-2.0 */ -#ifndef __NE16_ERROR_CODES_H__ -#define __NE16_ERROR_CODES_H__ +#include "neureka.h" -typedef enum { - success = 0, - weightBitwidthOutOfBounds, - unsupportedWeightOffsetMode, - unsupportedFeatureBitwidth, - dimensionMismatch -} nnx_error_code; +#define NEUREKA_STATUS_EMPTY (0x000) +#define NEUREKA_STATUS_FULL (0x101) -#endif // __NE16_ERROR_CODES_H__ \ No newline at end of file +inline int neureka_task_queue_size(neureka_dev_t *dev) { return 2; } + +inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) { + uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); + return (status & 0x1) + ((status >> 8) & 0x1); +} + +inline int neureka_task_queue_empty(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY; +} + +inline int neureka_task_queue_full(neureka_dev_t *dev) { + return 
hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL;
+}
diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h
similarity index 62%
rename from neureka/src/pulp_nnx_util.c
rename to neureka/hal/neureka.h
index daaaf2b..887d995 100644
--- a/neureka/src/pulp_nnx_util.c
+++ b/neureka/hal/neureka.h
@@ -18,13 +18,19 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "pulp_nnx_util.h"
-#include "pulp_nnx_hal.h"
+#ifndef __NEUREKA_H__
+#define __NEUREKA_H__
 
-void nnx_activate_gvsoc_logging(int log_level) {
-  NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level);
-}
+#include "hwpe.h"
+#include <stdint.h>
 
-void nnx_deactivate_gvsoc_logging() {
-  NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0);
-}
+typedef struct neureka_dev_t {
+  hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
+} neureka_dev_t;
+
+int neureka_task_queue_size(neureka_dev_t *dev);
+int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev);
+int neureka_task_queue_empty(neureka_dev_t *dev);
+int neureka_task_queue_full(neureka_dev_t *dev);
+
+#endif // __NEUREKA_H__
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
new file mode 100644
index 0000000..943c373
--- /dev/null
+++ b/neureka/hal/neureka_task.c
@@ -0,0 +1,234 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_task.h"
+#include "neureka_task_defs.h"
+#include "pulp_nnx_util.h"
+
+inline uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+                                         uint32_t i_width, uint32_t n_height,
+                                         uint32_t n_width) {
+  uint32_t tile_padding = padding;
+  if (i_height > 0) {
+    tile_padding &= ~(0xf << 28);
+  }
+  if (i_width < n_width - 1) {
+    tile_padding &= ~(0xf << 24);
+  }
+  if (i_height < n_height - 1) {
+    tile_padding &= ~(0xf << 20);
+  }
+  if (i_width > 0) {
+    tile_padding &= ~(0xf << 16);
+  }
+  return tile_padding;
+}
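+
+/* For example (illustrative numbers): with a 3x3 grid of tiles
+ * (n_height = n_width = 3), the center tile (i_height = i_width = 1) gets all
+ * four padding fields cleared, while the top-left tile (0, 0) keeps only its
+ * top and left padding. The fields are packed top/right/bottom/left into bits
+ * 31:28, 27:24, 23:20, and 19:16, as in neureka_task_set_padding below.
+ */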
+
+void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape,
+                       const uint8_t depthwise, const uint8_t input_bits,
+                       const uint8_t output_bits, const uint8_t weights_bits,
+                       const neureka_weight_offset_mode_e weights_offset_mode,
+                       const uint32_t weights_offset_factor,
+                       neureka_quant_t quant, neureka_norm_t norm,
+                       const uint8_t stride) {
+  const uint32_t flag_mode16 =
+      input_bits == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
+
+  *task = (neureka_task_t){
+      .outbytes = output_bits / 8,
+      .weight_d0_stride = flag_mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16
+                          : kernel_shape == 3
+                              ? NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3
+                              : NEUREKA_WEIGHT_D0_STRIDE_MODE8,
+      .qw = weights_bits,
+      .stride_shift = stride == 2 ? 1 : 0,
+      .output_channel_throughput = depthwise
+                                       ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3
+                                       : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT,
+      .input_channel_throughput = kernel_shape == 3
+                                      ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3
+                                      : NEUREKA_INPUT_CHANNEL_THROUGHPUT,
+      .kernel_shape = kernel_shape,
+      .depthwise = depthwise,
+      .data = {0}};
+
+  const int flag_stride2x2 = stride == 2 ? NEUREKA_FLAG_STRIDE_2x2 : 0;
+
+  const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1
+                        : depthwise == 1  ? NEUREKA_FLAG_MODE_3x3_DW
+                                          : NEUREKA_FLAG_MODE_3x3;
+
+  task->data.cfg.conf0 |=
+      NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode |
+      (quant.shift_amount << 16) | quant.flag_rounding << NEUREKA_SHIFT_ROUNDING |
+      norm.mode | norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
+      norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode |
+      flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2;
+
+  task->data.cfg.weight_offset_factor = weights_offset_factor;
+}
+
+/** neureka_pad_ptr
+ *
+ * Calculate the pointer as if it pointed to the start of the padded data
+ * rather than to the first real pixel.
+ * Necessary for the input pointer when the input is padded.
+ */
+inline uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+                                const uint32_t channel, const uint8_t bits,
+                                const uint8_t padding_top,
+                                const uint8_t padding_left) {
+  return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+}
+
+inline void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
+                                  uint32_t w_in, uint32_t k_in, uint8_t bits_in,
+                                  uint8_t padding_top, uint8_t padding_left,
+                                  uint32_t output_ptr, uint32_t weights_ptr,
+                                  uint32_t scale_ptr, uint32_t shift_ptr,
+                                  uint32_t bias_ptr) {
+  task->data.infeat_ptr =
+      neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+  task->data.outfeat_ptr = output_ptr;
+  task->data.weights_ptr = weights_ptr;
+  task->data.scale_ptr = scale_ptr;
+  task->data.scale_shift_ptr = shift_ptr;
+  task->data.scale_bias_ptr = bias_ptr;
+}
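+
+/* A minimal setup sketch (illustrative values only; the weight offset factor
+ * and the quantization/normalization settings depend on the layer): a 1x1
+ * convolution with 8-bit activations and weights could be configured as
+ *
+ *   neureka_task_t task;
+ *   neureka_task_init(&task, 1, 0, 8, 8, 8, weightOffsetModeLayerWise, -128,
+ *                     (neureka_quant_t){.shift_amount = 0,
+ *                                       .mode = quantMode8Bit,
+ *                                       .function = quantFunctionIdentity},
+ *                     (neureka_norm_t){.mode = normMode8Bit}, 1);
+ *
+ * followed by neureka_task_set_dims and neureka_task_set_ptrs.
+ */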
+
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+                              const uint32_t w_in_stride,
+                              const uint32_t k_in_stride,
+                              const uint32_t w_out_stride,
+                              const uint32_t k_out_stride) {
+  const uint32_t num_k_in = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
+
+  const neureka_stride_t input_stride = {
+      .d0 = k_in_stride,
+      .d1 = k_in_stride * w_in_stride,
+      .d2 = task->depthwise ? 0
+            : task->kernel_shape == 1
+                ? k_in_stride * 3 * 3 // TODO: Check this magic
+                : k_in_stride * NEUREKA_FILTER_BUFFER_SIZE *
+                      NEUREKA_FILTER_BUFFER_SIZE};
+  task->data.cfg.input_stride = input_stride;
+
+  // WARNING: The 2x2 strided mode works only for output channel counts
+  // divisible by 2
+  const neureka_stride_t output_stride = {
+      .d0 = 32,
+      .d1 = (k_out_stride * task->outbytes) >> task->stride_shift,
+      .d2 =
+          (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift};
+  task->data.cfg.output_stride = output_stride;
+
+  if (task->kernel_shape == 1) {
+    task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw;
+    task->data.cfg.weights_stride.d1 =
+        task->weight_d0_stride * task->qw * num_k_in;
+    task->data.cfg.weights_stride.d2 = 0;
+  } else if (!task->depthwise) {
+    task->data.cfg.weights_stride.d0 = task->weight_d0_stride;
+    task->data.cfg.weights_stride.d1 =
+        task->weight_d0_stride * task->qw * num_k_in;
+    task->data.cfg.weights_stride.d2 = 0;
+  } else {
+    task->data.cfg.weights_stride.d0 =
+        NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * task->weight_d0_stride;
+    task->data.cfg.weights_stride.d1 = 0;
+    task->data.cfg.weights_stride.d2 = 0;
+  }
+}
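+
+/* The counters below split the work into subtiles: spatially into
+ * NEUREKA_FILTER_SIZE x NEUREKA_FILTER_SIZE (6x6) output blocks, and along the
+ * channels into groups of the input/output channel throughput. For example
+ * (illustrative numbers), h_out = 16 gives num_Ho = divnceil(16, 6) = 3
+ * subtiles, with rem_Ho = remainder(16, 6) = 4 rows in the last one.
+ */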
+
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+                               const uint32_t h_out, const uint32_t w_out,
+                               const uint32_t k_out,
+                               const uint8_t padding_bottom,
+                               const uint8_t padding_right) {
+  const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput);
+  const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput);
+  const uint16_t num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
+  const uint16_t num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
+
+  const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput);
+  const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput);
+  const uint16_t rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
+  const uint16_t rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
+  const uint16_t rem_Hi =
+      (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) -
+      padding_bottom; // TODO: Check padding bottom
+  const uint16_t rem_Wi =
+      (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) -
+      padding_right; // TODO: Check padding right
+
+  const neureka_subtile_t subtile = {
+      .number = {.KoKi = concat_half(num_Ko, num_Ki),
+                 .HoWo = concat_half(num_Ho, num_Wo)},
+      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
+                    .HoWo = concat_half(rem_Ho, rem_Wo),
+                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
+  task->data.cfg.subtile = subtile;
+}
+
+inline void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+                                     const uint8_t bottom, const uint8_t left,
+                                     const uint8_t right, const uint8_t value) {
+  task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) |
+                           ((bottom & 0xf) << 20) | ((left & 0xf) << 16) |
+                           (value & 0xff);
+}
+
+inline void neureka_task_set_mask_filter(neureka_task_t *task,
+                                         const uint8_t top, const uint8_t right,
+                                         const uint8_t bottom,
+                                         const uint8_t left) {
+  task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
+                               ((bottom & 0xff) << 8) | ((left & 0xff) << 0);
+}
+
+void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in,
+                           const uint32_t k_in, const uint32_t w_in_stride,
+                           const uint32_t k_in_stride, const uint32_t h_out,
+                           const uint32_t w_out, const uint32_t k_out,
+                           const uint32_t w_out_stride,
+                           const uint32_t k_out_stride,
+                           const uint8_t padding_top,
+                           const uint8_t padding_bottom,
+                           const uint8_t padding_right,
+                           const uint8_t padding_left) {
+  neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                           k_out_stride);
+  neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
+                            padding_right);
+  neureka_task_set_padding(task, padding_top, padding_bottom, padding_left,
+                           padding_right, 0);
+}
+
+void neureka_task_set_dims_stride2x2(
+    neureka_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left) {
+  const uint8_t stride = 2;
+
+  neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                           k_out_stride);
+  neureka_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
+                            k_out, h_in + padding_top >= 5 ? 0 : padding_bottom,
+                            0);
+
+  const uint8_t padding_bottom_new =
+      (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
+  const uint8_t padding_right_new =
+      (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right;
+
+  neureka_task_set_padding(task, padding_top, padding_bottom_new, padding_left,
+                           padding_right_new, 0);
+}
diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h
new file mode 100644
index 0000000..7f4c31b
--- /dev/null
+++ b/neureka/hal/neureka_task.h
@@ -0,0 +1,173 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_TASK_H__ +#define __NEUREKA_TASK_H__ + +#include "neureka_task_defs.h" +#include + +typedef enum neureka_task_flag_e { + neurekaTaskFlagFalse = 0, + neurekaTaskFlagTrue = 1 +} neureka_task_flag_e; + +typedef enum neureka_weight_offset_mode_e { + weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, + weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE +} neureka_weight_offset_mode_e; + +typedef enum { + normMode8Bit = NEUREKA_NORM_MODE_8BIT, + normMode16Bit = NEUREKA_NORM_MODE_16BIT, + normMode32Bit = NEUREKA_NORM_MODE_32BIT +} neureka_norm_mode_e; + +typedef struct neureka_norm_t { + neureka_norm_mode_e mode; + int flag_bias; + int flag_shift; +} neureka_norm_t; + +typedef enum neureka_quant_mode_e { + quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, + quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, + quantMode32Bit = NEUREKA_QUANT_MODE_32BIT +} neureka_quant_mode_e; + +typedef enum neureka_quant_function_e { + quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, + quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU +} neureka_quant_function_e; + +typedef struct neureka_quant_t { + // Shift amount must be in range 0x00-0x1F + unsigned shift_amount; + neureka_quant_mode_e mode; + neureka_quant_function_e function; + int flag_rounding; +} neureka_quant_t; + +typedef struct neureka_stride_t { + uint32_t d0; + uint32_t d1; + uint32_t d2; +} neureka_stride_t; + +typedef struct neureka_subtile_remainder_t { + uint32_t KoKi; + uint32_t HoWo; + uint32_t HiWi; +} neureka_subtile_remainder_t; + +typedef struct neureka_subtile_number_t { + uint32_t KoKi; + uint32_t HoWo; +} neureka_subtile_number_t; + +typedef struct neureka_subtile_t { + neureka_subtile_remainder_t remainder; + neureka_subtile_number_t number; +} neureka_subtile_t; + +typedef struct neureka_cfg_t { + neureka_stride_t input_stride; + neureka_stride_t output_stride; + neureka_stride_t weights_stride; + neureka_subtile_t subtile; + uint32_t padding; + uint32_t weight_offset_factor; + uint32_t filter_mask; + uint32_t conf0; +} neureka_cfg_t; + +typedef struct neureka_task_data_t { + uint32_t weights_ptr; + uint32_t infeat_ptr; + uint32_t outfeat_ptr; + uint32_t scale_ptr; + uint32_t scale_shift_ptr; + uint32_t scale_bias_ptr; + neureka_cfg_t cfg; +} neureka_task_data_t; + +typedef struct neureka_task_t { + neureka_task_data_t data; + uint8_t outbytes; + uint8_t weight_d0_stride; + uint8_t qw; + uint8_t stride_shift; + uint8_t output_channel_throughput; + uint8_t input_channel_throughput; + uint8_t kernel_shape; + uint8_t depthwise; + uint8_t id; +} neureka_task_t; + +void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weights_bits, + const neureka_weight_offset_mode_e weights_offset_mode, + const uint32_t weights_offset_factor, neureka_quant_t quant, + neureka_norm_t norm, const uint8_t stride); +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width); +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, const uint8_t padding_left); +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t w_in, + uint32_t k_in, uint8_t bits_in, uint8_t padding_top, + uint8_t padding_left, uint32_t output_ptr, + uint32_t weights_ptr, uint32_t scale_ptr, + uint32_t 
shift_ptr, uint32_t bias_ptr); +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t w_in_stride, + const uint32_t k_in_stride, + const uint32_t w_out_stride, + const uint32_t k_out_stride); +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t padding_bottom, + const uint8_t padding_right); +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value); +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left); +void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, + const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, + const uint8_t padding_left); +void neureka_task_set_dims_stride2x2( + neureka_task_t *task, const uint32_t h_in, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, + const uint8_t padding_left); + +#endif // !__NEUREKA_TASK_H__ diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h new file mode 100644 index 0000000..daa9897 --- /dev/null +++ b/neureka/hal/neureka_task_defs.h @@ -0,0 +1,114 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_DEFS_H__
+#define __NEUREKA_DEFS_H__
+
+/* ARCHITECTURE */
+
+#define NEUREKA_FILTER_SIZE (6)
+#define NEUREKA_FILTER_BUFFER_SIZE (8)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
+#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_WEIGHT_BANDWIDTH (256)
+
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)
+
+/* TASK REGISTERS */
+
+// job configuration
+#define NEUREKA_REG_WEIGHTS_PTR 0
+#define NEUREKA_REG_INFEAT_PTR 1
+#define NEUREKA_REG_OUTFEAT_PTR 2
+#define NEUREKA_REG_SCALE_PTR 3
+#define NEUREKA_REG_SCALE_SHIFT_PTR 4
+#define NEUREKA_REG_SCALE_BIAS_PTR 5
+#define NEUREKA_REG_INFEAT_D0_STRIDE 6
+#define NEUREKA_REG_INFEAT_D1_STRIDE 7
+#define NEUREKA_REG_INFEAT_D2_STRIDE 8
+#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9
+#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10
+#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11
+#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12
+#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13
+#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14
+#define NEUREKA_REG_SUBTILE_REMAINDER_0 15
+#define NEUREKA_REG_SUBTILE_REMAINDER_1 16
+#define NEUREKA_REG_SUBTILE_REMAINDER_2 17
+#define NEUREKA_REG_SUBTILE_NUMBER_0 18
+#define NEUREKA_REG_SUBTILE_NUMBER_1 19
+#define NEUREKA_REG_PADDING 20
+#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21
+#define NEUREKA_REG_FILTER_MASKING 22
+#define NEUREKA_REG_CONF0 23
+
+/* SHIFT */
+
+#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
+#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
+#define NEUREKA_SHIFT_QUANT_SHIFT (16)
+#define NEUREKA_SHIFT_ROUNDING (11)
+
+/* CONF0 FLAGS */
+
+#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
+#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
+#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
+#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
+#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
+#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
+#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
+// conf0[20:16] - quantization shift amount
+#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
+#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
+#define NEUREKA_FLAG_STREAMIN (1 << 14)
+#define NEUREKA_NORM_MODE_8BIT (0 << 12)
+#define NEUREKA_NORM_MODE_16BIT (1 << 12)
+#define NEUREKA_NORM_MODE_32BIT (2 << 12)
+#define NEUREKA_FLAG_ROUND (1 << 11)
+#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
+#define NEUREKA_FLAG_USE_WMEM (1 << 9)
+#define NEUREKA_FLAG_USE_TCDM (0 << 9)
+#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // TODO: Check if the `STRIDED` mode is still `STRIDE_2x2`
+#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
+#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
+#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
+#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
+#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
+#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
+#define NEUREKA_FLAG_MODE16 (1 << 3)
+
+/* Masks */
+
+#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23)
+#define NEUREKA_MASK_QUANT_MODE (3 << 21)
+
+/* PADDING */
+
+#define NEUREKA_DONT_PAD (0)
+#define NEUREKA_MAX_PAD (2)
+
+/* NORM */
+#define NEUREKA_NORM_MAX_LEN (32)
+
+#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h
deleted file mode 100644
index e8ecba5..0000000
--- a/neureka/inc/pulp_nnx_defs.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Luka Macan
- * Arpan Prasad
- *
- * Copyright 2023 ETH Zurich
and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_DEFS_H__ -#define __NEUREKA_DEFS_H__ - -/* ARHITECTURE */ - -#define NEUREKA_FILTER_SIZE (6) -#define NEUREKA_FILTER_BUFFER_SIZE (8) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) -#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_CONTEXT_SIZE (2) -#define NEUREKA_WEIGHT_BANDWIDTH (256) - -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16) - -/* REGISTER MAP */ - -#define NEUREKA_EVT0 12 -#define NEUREKA_EVT1 13 -#define NEUREKA_BASE_ADDR 0x00201000 -#define WEIGHT_MEM_BASE 0x10400000 -#define SRAM_OFFSET 0x00400000 -#define MRAM_OFFSET 0x00000000 - -// Cluster -#define CLUSTER_CTRL_BASE_ADDR 0x00200000 -#define CLUSTER_CTRL_HWPE_OFFS 0x18 -#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800 - -/* REGISTER OFFSETS */ - -// commands -#define NEUREKA_TRIGGER 0x00 -#define NEUREKA_ACQUIRE 0x04 -#define NEUREKA_FINISHED 0x08 -#define NEUREKA_STATUS 0x0C -#define NEUREKA_RUNNING_JOB 0x10 -#define NEUREKA_SOFT_CLEAR 0x14 -#define NEUREKA_SWSYNC 0x18 -#define NEUREKA_URISCY_IMEM 0x1C - -// job configuration -#define NEUREKA_REGISTER_OFFSET 0x20 - -#define NEUREKA_REG_WEIGHTS_PTR 0x00 -#define NEUREKA_REG_INFEAT_PTR 0x04 -#define NEUREKA_REG_OUTFEAT_PTR 0x08 -#define NEUREKA_REG_SCALE_PTR 0x0C -#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10 -#define NEUREKA_REG_SCALE_BIAS_PTR 0x14 -#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18 -#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C -#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20 -#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24 -#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28 -#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C -#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30 -#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34 -#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38 -#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C -#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40 -#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44 -#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48 -#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C -#define NEUREKA_REG_PADDING 0x50 -#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54 -#define NEUREKA_REG_FILTER_MASKING 0x58 -#define NEUREKA_REG_CONF0 0x5C - -// Simulation only -#define NEUREKA_REG_GVSOC_TRACE 0x60 - -/* SHIFT */ - -#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) -#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) -#define NEUREKA_SHIFT_QUANT_SHIFT (16) -#define NEUREKA_SHIFT_ROUNDING (11) - -/* CONF0 FLAGS */ - -#define NEUREKA_FLAG_NORM_BIAS (1 << 25) -#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) -#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) -#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) -#define NEUREKA_QUANT_MODE_8BIT (0 << 21) -#define NEUREKA_QUANT_MODE_16BIT (1 << 21) -#define NEUREKA_QUANT_MODE_32BIT (2 << 21) -// 
conf0[20:16] - quantization shift amount -#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) -#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) -#define NEUREKA_FLAG_STREAMIN (1 << 14) -#define NEUREKA_NORM_MODE_8BIT (0 << 12) -#define NEUREKA_NORM_MODE_16BIT (1 << 12) -#define NEUREKA_NORM_MODE_32BIT (2 << 12) -#define NEUREKA_FLAG_ROUND (1 << 11) -#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) -#define NEUREKA_FLAG_USE_WMEM (1 << 9) -#define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define NEUREKA_FLAG_STRIDED_MODE (1 << 8) -#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) -#define NEUREKA_FLAG_MODE_3x3 (0 << 5) -#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) -#define NEUREKA_FLAG_MODE_1x1 (2 << 5) -#define NEUREKA_FLAG_NORM_QUANT (1 << 4) -#define NEUREKA_FLAG_MODE_BASIC (0 << 3) -#define NEUREKA_FLAG_MODE16 (1 << 3) - -/* Masks */ - -#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23) -#define NEUREKA_MASK_QUANT_MODE (3 << 21) - -/* Miscellaneous */ - -// Padding -#define MAX_PAD (0xf) - -// Normalization -#define NEUREKA_NORM_MAX_LEN (32) -#define NO_NORM(length) \ - { \ - .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL, \ - .length = length, .mode = normMode32Bit \ - } - -// Quantization -#define NO_QUANT \ - { \ - .shift_amount = 0, .mode = quantMode32Bit, \ - .function = quantFunctionIdentity \ - } - -// GVSOC trace levels -#define NEUREKA_TRACE_LEVEL_JOB_START_END 0 -#define NEUREKA_TRACE_LEVEL_CONFIG 1 -#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2 -#define NEUREKA_TRACE_LEVEL_ALL 3 - -// null -#define NEUREKA_NULL ((void *)0) -#define NEUREKA_STATUS_FULL (0x101) - -#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h deleted file mode 100644 index 40bcec0..0000000 --- a/neureka/inc/pulp_nnx_hal.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_H__ -#define __NEUREKA_H__ - -#include - -#include "pulp_nnx_defs.h" -#include "pulp_nnx_error_codes.h" - -#define NEUREKA_CG_ENABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |= \ - CLUSTER_CTRL_HWPE_CG_EN_MASK -#define NEUREKA_CG_DISABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &= \ - ~CLUSTER_CTRL_HWPE_CG_EN_MASK - -#define NEUREKA_WRITE(offset, value) \ - *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value) -#define NEUREKA_WRITE_BE(offset, value, be) \ - *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value) -#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) - -#define NEUREKA_WRITE_IO_REG(offset, value) \ - NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value)) -#define NEUREKA_WRITE_IO_REG_BE(offset, value, be) \ - NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be)) -#define NEUREKA_READ_IO_REG(offset) \ - NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset)) - -#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0) -#define NEUREKA_BARRIER() \ - do { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BUSYWAIT() \ - do { \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; -#define NEUREKA_NOBARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; - -#define DIVNCEIL(A, B) (((A - 1) / B) + 1) -#define REMAINDER(A, B) (((A - 1) % B) + 1) -#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff)) - -#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE - -#define FLAG_USED (1) -#define FLAG_UNUSED (0) - -typedef enum { - weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, - weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE -} nnx_weight_offset_mode_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - uint16_t n_weights; - uint32_t bitwidth; - int32_t offset_factor; - nnx_weight_offset_mode_e offset_mode; -} nnx_weights_t; - -typedef enum { - featureBitwidth8Bit = 8, - featureBitwidth16Bit = 16, - featureBitwidth32Bit = 32 -} nnx_feature_bitwidth_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - nnx_feature_bitwidth_e bitwidth; -} nnx_feature_t; - -typedef enum { - normMode8Bit = NEUREKA_NORM_MODE_8BIT, - normMode16Bit = NEUREKA_NORM_MODE_16BIT, - normMode32Bit = NEUREKA_NORM_MODE_32BIT -} nnx_norm_mode_e; - -typedef struct { - nnx_norm_mode_e mode; - int flag_bias; - int flag_shift; -} nnx_norm_t; - -typedef enum { - quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, - quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, - quantMode32Bit = NEUREKA_QUANT_MODE_32BIT -} nnx_quant_mode_e; - -typedef enum { - quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, - quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU -} nnx_quant_function_e; - -// TODO: add rounding to quant. Should also be an enum? Best boolean... 
-typedef struct { - // Shift amount must be in range 0x00-0x1F - unsigned shift_amount; - nnx_quant_mode_e mode; - nnx_quant_function_e function; - int flag_rounding; -} nnx_quant_t; - -typedef struct { - uint32_t d0; - uint32_t d1; - uint32_t d2; -} nnx_stride_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; - uint32_t HiWi; -} nnx_subtile_remainder_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; -} nnx_subtile_number_t; - -typedef struct { - nnx_subtile_remainder_t remainder; - nnx_subtile_number_t number; -} nnx_subtile_t; - -typedef struct { - nnx_stride_t input_stride; - nnx_stride_t output_stride; - nnx_stride_t weights_stride; - nnx_subtile_t subtile; - uint32_t padding; - uint32_t weight_offset_factor; - uint32_t filter_mask; - uint32_t conf0; -} nnx_cfg_t; - -typedef struct { - uint32_t weights_ptr; - uint32_t infeat_ptr; - uint32_t outfeat_ptr; - uint32_t scale_ptr; - uint32_t scale_shift_ptr; - uint32_t scale_bias_ptr; - nnx_cfg_t cfg; -} nnx_task_t; - -int nnx_job_id(); -int nnx_empty(); -int nnx_full(); -void nnx_soft_clear(); -int nnx_acquire(); -void nnx_offload(nnx_task_t *task); -void nnx_offload_ptr(nnx_task_t *task); -void nnx_run_async(); -void nnx_run_blocking(); -void nnx_commit(); -void nnx_wait_empty(); -void nnx_wait_not_full(); -void nnx_wait_on_id(int id); -void nnx_busywait(); - -void nnx_task_init(nnx_task_t *task); -int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom, - uint32_t left, uint16_t value); -int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant); -void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom, - uint8_t left); -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); - -#endif /* __NEUREKA_H__ */ diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h deleted file mode 100644 index f29ff3e..0000000 --- a/neureka/inc/pulp_nnx_util.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __PULP_NNX_UTIL__ -#define __PULP_NNX_UTIL__ - -void nnx_activate_gvsoc_logging(int use_dec); -void nnx_deactivate_gvsoc_logging(); - -#endif /* __PULP_NNX_UTIL__ */ diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c deleted file mode 100644 index 1d99691..0000000 --- a/neureka/src/pulp_nnx_hal.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Luka Macan - * Arpan Prasad - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "pulp_nnx_hal.h" -#include "pmsis.h" - -static int qw, weight_d0_stride, outbytes; - -// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and -// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise -// the compiler is not able to correctly factorize the NEUREKA base in case -// several accesses are done, ending up with twice more code - -// __builtin_pulp_OffsetedX not defined - needs further investigation... (too -// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK... - -int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); } - -int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; } - -int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); } - -void nnx_soft_clear() { - NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0); - for (volatile int i = 0; i < 10; i++) - ; -} - -int nnx_acquire() { - int job_id = -1; - NEUREKA_BARRIER_ACQUIRE(job_id); - return job_id; -} - -void nnx_offload(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_offload_ptr(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < 6; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); } - -void nnx_run_blocking() { - nnx_run_async(); - nnx_wait_empty(); -} - -void nnx_commit() { - NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger -} - -void nnx_busywait() { NEUREKA_BUSYWAIT(); } - -void nnx_wait_empty() { - while (!nnx_empty()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_not_full() { - while (nnx_full()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_on_id(const int id) { - while (nnx_job_id() <= id) { - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); - }; -} - -void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); } - -int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right, - const uint32_t bottom, const uint32_t left, - const uint16_t value) { - uint32_t padding = 0; - uint32_t flags = 0; - - if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) { - return 1; - } - - cfg->padding = - (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value; - - return 0; -} - -int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm, - const nnx_quant_t quant) { - if 
(quant.shift_amount > 31) { - printf("ERROR! quant.shift_amount > 31\n"); - return 1; - } - - if (quant.mode == quantMode16Bit) { - printf("ERROR! quant.mode == quantMode16Bit\n"); - return 1; - } - - BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | - norm.mode | - norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT); - - return 0; -} - -void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right, - const uint8_t bottom, const uint8_t left) { - cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) | - ((uint32_t)bottom << 8) | ((uint32_t)left << 0); -} - -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho; - const int rem_Wi = rem_Wo; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_in, - .d1 = k_in * w_out, - .d2 = k_in * 3 * 3 // copying arpan - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = weight_d0_stride * qw, - .d1 = weight_d0_stride * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height != output.height || input.width != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = {.d0 = k_in, - .d1 = k_in * (w_out + 2), - .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE * - NEUREKA_FILTER_BUFFER_SIZE}; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3, - .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ki = num_Ko; - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ki = rem_Ko; - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_out, - .d1 = k_out * (w_out + 2), - .d2 = 0 // Unused - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride, - .d1 = 0, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != output.depth) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
-  qw = weights.bitwidth;
-
-  nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth,
-                              input.depth);
-
-  // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
-  cfg->weight_offset_factor = weights.offset_factor;
-
-  return 0;
-}
diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c
new file mode 100644
index 0000000..440ec07
--- /dev/null
+++ b/src/pulp_nnx_neureka.c
@@ -0,0 +1,131 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "pulp_nnx_neureka.h"
+#include "hwpe.h"
+#include "neureka.h"
+#include "pulp_nnx_util.h"
+#include
+#include
+#include
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf) {
+  neureka_siracusa_open(conf);
+  hwpe_soft_clear(&dev->hwpe_dev);
+}
+
+void neureka_nnx_term(neureka_dev_t *dev) {
+  hwpe_soft_clear(&dev->hwpe_dev);
+  neureka_siracusa_close();
+}
+
+int neureka_nnx_dispatch_check(neureka_dev_t *dev) {
+  return !neureka_task_queue_full(dev);
+}
+
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev) {
+  while (!neureka_nnx_dispatch_check(dev)) {
+    neureka_siracusa_event_wait_and_clear();
+  }
+}
+
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) {
+  if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) {
+    return 1;
+  }
+  hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data,
+                             (int)(sizeof(neureka_task_data_t) / 4));
+  hwpe_task_queue_release_and_run(&dev->hwpe_dev);
+  return 0;
+}
+
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) {
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+  // GVSOC model has a broken running_id, so resolve_check
+  // conservatively checks whether the task queue is empty.
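+  // On the real target (the #else branch below) the check instead compares
+  // 8-bit wrapping task ids: if the last finished id equals task->id - 1
+  // (255 when task->id is 0), this task has not completed yet; if it equals
+  // task->id while the queue still holds work, the match could come from an
+  // older task with the same wrapped-around id, so the task is likewise
+  // conservatively reported as unresolved.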
+ return neureka_task_queue_empty(dev); +#else + uint8_t prev_task_id = task->id - 1; + return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || + (hwpe_last_task_id(&dev->hwpe_dev) == task->id && + !neureka_task_queue_empty(dev))); +#endif +} + +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) { + while (!neureka_nnx_resolve_check(dev, task)) { + neureka_siracusa_event_wait_and_clear(); + } +} + +static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, + uint32_t size_j, uint32_t size_k, + uint32_t stride_j, uint32_t stride_k, + uint32_t overlap_i, uint32_t overlap_j, + uint32_t offset_i, uint32_t offset_j, + uint8_t data_size) { + return ptr + + (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k * + data_size / 8 + + (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; +} + +void neureka_nnx_dispatch_stride2x2( + neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t h_ker, const uint8_t w_ker) { + const uint8_t stride = 2; + const uint8_t bits = 8; + + const uint32_t n_h = divnceil(h_out, stride); + const uint32_t n_w = divnceil(w_out, stride); + const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0; + const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0; + const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0; + const uint32_t output_width_offset = w_out % stride == 1 ? 1 : 0; + + const uint32_t input_base = task->data.infeat_ptr; + const uint32_t output_base = task->data.outfeat_ptr; + const uint32_t tile_padding = task->data.cfg.padding; + + for (int i = 0; i < n_h; i++) { + for (int j = 0; j < n_w; j++) { + task->data.infeat_ptr = + _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, + w_in_stride, k_in_stride, h_ker - stride, + w_ker - stride, i == 0 ? 0 : input_height_offset, + j == 0 ? 0 : input_width_offset, bits); + task->data.outfeat_ptr = + _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride, + k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset, + j == 0 ? 
0 : output_width_offset, bits); + + task->data.cfg.padding = + neureka_get_tile_padding(tile_padding, i, j, n_h, n_w); + + // Altered dispatch to wait if cannot acquire + while (neureka_nnx_dispatch(dev, task)) { + neureka_siracusa_event_wait_and_clear(); + } + } + } +} From e09641512dd141de49967f098231f4246d084516 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sun, 14 Jan 2024 19:52:28 +0100 Subject: [PATCH 02/72] add neureka support to test app --- test/app/Makefile | 7 ++- test/app/src/nnx_layer.c | 119 +++++++++++++++++++++++++++++++-------- 2 files changed, 100 insertions(+), 26 deletions(-) diff --git a/test/app/Makefile b/test/app/Makefile index 14f30fd..493c092 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -40,6 +40,8 @@ INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp INC_DIRS += gen/inc INC_FLAGS += $(addprefix -I,$(INC_DIRS)) +APP_CFLAGS += $(INC_FLAGS) + # Source files @@ -58,7 +60,10 @@ APP_SRCS += $(wildcard gen/src/*.c) # Flags -APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto +ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:]) +APP_CFLAGS += -DNNX_ACCELERATOR=$(ACCELERATOR) -DNNX_$(ACCELERATOR_UPPERCASE) + +APP_CFLAGS += -O2 -w -Wall -Werror -flto APP_LDFLAGS += -flto include $(RULES_DIR)/pmsis_rules.mk diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index ffd93a1..414f0a3 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -19,12 +19,81 @@ */ #include "nnx_layer.h" +#include + +#ifdef NNX_NE16 + #include "ne16.h" #include "ne16_gvsoc.h" #include "ne16_pulp_bsp.h" #include "ne16_task.h" #include "pulp_nnx_ne16.h" -#include + +typedef ne16_quant_t nnx_quant_t; +typedef ne16_norm_t nnx_norm_t; +typedef ne16_task_t nnx_task_t; +typedef ne16_dev_t nnx_dev_t; +typedef ne16_pulp_conf_t nnx_bsp_conf_t; + +#define nnxTaskFlagTrue ne16TaskFlagTrue +#define nnxTaskFlagFalse ne16TaskFlagFalse + +#define nnx_task_init ne16_task_init +#define nnx_task_set_dims ne16_task_set_dims +#define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 +#define nnx_task_set_ptrs ne16_task_set_ptrs + +#define NNX_GVSOC_LOG_LEVEL_CONFIG NE16_GVSOC_LOG_LEVEL_CONFIG +#define NNX_GVSOC_LOG_FORMAT_HEXADECIMAL NE16_GVSOC_LOG_FORMAT_HEXADECIMAL +#define nnx_gvsoc_log_activate ne16_gvsoc_log_activate +#define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate + +#define nnx_bsp_get_dev ne16_pulp_get_dev + +#define nnx_init ne16_nnx_init +#define nnx_dispatch_wait ne16_nnx_dispatch_wait +#define nnx_dispatch_stride2x2 ne16_nnx_dispatch_stride2x2 +#define nnx_dispatch ne16_nnx_dispatch +#define nnx_resolve_wait ne16_nnx_resolve_wait +#define nnx_term ne16_nnx_term + +#elif defined NNX_NEUREKA + +#include "neureka.h" +#include "neureka_gvsoc.h" +#include "neureka_siracusa_bsp.h" +#include "neureka_task.h" +#include "pulp_nnx_neureka.h" + +typedef neureka_quant_t nnx_quant_t; +typedef neureka_norm_t nnx_norm_t; +typedef neureka_task_t nnx_task_t; +typedef neureka_dev_t nnx_dev_t; +typedef neureka_siracusa_conf_t nnx_bsp_conf_t; + +#define nnxTaskFlagTrue neurekaTaskFlagTrue +#define nnxTaskFlagFalse neurekaTaskFlagFalse + +#define nnx_task_init neureka_task_init +#define nnx_task_set_dims neureka_task_set_dims +#define nnx_task_set_dims_stride2x2 neureka_task_set_dims_stride2x2 +#define nnx_task_set_ptrs neureka_task_set_ptrs + +#define NNX_GVSOC_LOG_LEVEL_CONFIG NEUREKA_GVSOC_LOG_LEVEL_CONFIG +#define NNX_GVSOC_LOG_FORMAT_HEXADECIMAL NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL +#define nnx_gvsoc_log_activate 
neureka_gvsoc_log_activate +#define nnx_gvsoc_log_deactivate neureka_gvsoc_log_deactivate + +#define nnx_bsp_get_dev neureka_siracusa_get_dev + +#define nnx_init neureka_nnx_init +#define nnx_dispatch_wait neureka_nnx_dispatch_wait +#define nnx_dispatch_stride2x2 neureka_nnx_dispatch_stride2x2 +#define nnx_dispatch neureka_nnx_dispatch +#define nnx_resolve_wait neureka_nnx_resolve_wait +#define nnx_term neureka_nnx_term + +#endif // NNX_NE16 || NNX_NEUREKA // Generated headers #include "bias.h" @@ -34,34 +103,34 @@ #include "scale.h" #include "weight.h" -static void task_prepare(ne16_task_t *task) { - ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, +static void task_prepare(nnx_task_t *task) { + nnx_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, - (ne16_quant_t){.shift_amount = OUTSHIFT, + (neureka_quant_t){.shift_amount = OUTSHIFT, .mode = quantMode8Bit, .function = HAS_RELU ? quantFunctionRelu : quantFunctionIdentity, - .flag_rounding = ne16TaskFlagFalse}, - (ne16_norm_t){.mode = normMode8Bit, - .flag_bias = HAS_BIAS ? ne16TaskFlagTrue - : ne16TaskFlagFalse, - .flag_shift = ne16TaskFlagFalse}, + .flag_rounding = nnxTaskFlagFalse}, + (neureka_norm_t){.mode = normMode8Bit, + .flag_bias = HAS_BIAS ? nnxTaskFlagTrue + : nnxTaskFlagFalse, + .flag_shift = nnxTaskFlagFalse}, STRIDE_HEIGHT); if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { - ne16_task_set_dims_stride2x2( + nnx_task_set_dims_stride2x2( task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); } else { - ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); } - ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, + nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, (uint32_t)weight, (uint32_t)scale, NULL, #if HAS_BIAS == 1 @@ -72,35 +141,35 @@ static void task_prepare(ne16_task_t *task) { ); } -static void task_execute(ne16_task_t *task) { - ne16_dev_t *dev = ne16_pulp_get_dev(); +static void task_execute(nnx_task_t *task) { + nnx_dev_t *dev = nnx_bsp_get_dev(); - ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG, - NE16_GVSOC_LOG_FORMAT_HEXADECIMAL); + nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL_CONFIG, + NNX_GVSOC_LOG_FORMAT_HEXADECIMAL); - ne16_pulp_conf_t conf = {.max_stall = 8}; - ne16_nnx_init(dev, &conf); + nnx_bsp_conf_t conf = {.max_stall = 8}; + nnx_init(dev, &conf); - ne16_nnx_dispatch_wait(dev); + nnx_dispatch_wait(dev); if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { - ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH); } else { - ne16_nnx_dispatch(dev, task); + nnx_dispatch(dev, task); } - ne16_nnx_resolve_wait(dev, task); + nnx_resolve_wait(dev, task); - ne16_nnx_term(dev); + nnx_term(dev); - ne16_gvsoc_log_deactivate(dev); + nnx_gvsoc_log_deactivate(dev); } void execute_nnx_layer(void *args) { - 
ne16_task_t task; + nnx_task_t task; task_prepare(&task); task_execute(&task); } From c480025c03e38b5951fd89f60a79fcc8c75bd748 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sun, 14 Jan 2024 21:46:33 +0100 Subject: [PATCH 03/72] Add weight mem source flag --- neureka/hal/neureka_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 943c373..8b0d559 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -76,8 +76,8 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | (quant.shift_amount << 16) | quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | norm.mode | norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode | - flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2; + norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | NEUREKA_FLAG_USE_TCDM | + weights_offset_mode | flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2; task->data.cfg.weight_offset_factor = weights_offset_factor; } From 953db1eb38d9a60db31c113a3840042a3a22e3f5 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 15 Jan 2024 08:22:57 +0100 Subject: [PATCH 04/72] Fix formatting --- inc/pulp_nnx_ne16.h | 8 +- inc/pulp_nnx_neureka.h | 16 ++-- ne16/hal/ne16_task.c | 8 +- ne16/hal/ne16_task.h | 5 +- neureka/bsp/neureka_siracusa_bsp.c | 9 +- neureka/hal/neureka_task.c | 127 +++++++++++++++-------------- neureka/hal/neureka_task.h | 68 +++++++-------- neureka/hal/neureka_task_defs.h | 3 +- src/pulp_nnx_ne16.c | 4 +- src/pulp_nnx_neureka.c | 4 +- test/app/src/nnx_layer.c | 44 +++++----- 11 files changed, 157 insertions(+), 139 deletions(-) diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h index eff9a60..7bbda6d 100644 --- a/inc/pulp_nnx_ne16.h +++ b/inc/pulp_nnx_ne16.h @@ -43,7 +43,8 @@ void ne16_nnx_dispatch_wait(ne16_dev_t *dev); /** ne16_nnx_dispatch * * Dispatch a task to the accelerator. - * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0. + * Fails with return code 1 if the task cannot be dispatched. Otherwise returns + * 0. */ int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task); @@ -59,7 +60,6 @@ int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task); */ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); - /* Additional helper functions */ /** ne16_nnx_dispatch_stride2x2 @@ -70,8 +70,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); * Works only if the k_out is divisible by 2. */ void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint32_t w_out_stride, const uint32_t k_out_stride, const uint8_t h_ker, const uint8_t w_ker); diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h index cabf30a..b811f25 100644 --- a/inc/pulp_nnx_neureka.h +++ b/inc/pulp_nnx_neureka.h @@ -43,7 +43,8 @@ void neureka_nnx_dispatch_wait(neureka_dev_t *dev); /** neureka_nnx_dispatch * * Dispatch a task to the accelerator. - * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0. + * Fails with return code 1 if the task cannot be dispatched. 
Otherwise returns + * 0. */ int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task); @@ -59,19 +60,18 @@ int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task); */ void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task); - /* Additional helper functions */ /** neureka_nnx_dispatch_stride2x2 * - * It uses Neureka's 2x2 strided mode which reduces the number of writes Neureka does. - * This mode doesn't stride the Neureka's subtile input pointer, so we have to - * tile the tile to the subtile's spatial dimensions (in this case 3x3 output). - * Works only if the k_out is divisible by 2. + * It uses Neureka's 2x2 strided mode which reduces the number of writes Neureka + * does. This mode doesn't stride the Neureka's subtile input pointer, so we + * have to tile the tile to the subtile's spatial dimensions (in this case 3x3 + * output). Works only if the k_out is divisible by 2. */ void neureka_nnx_dispatch_stride2x2( - neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint32_t w_out_stride, const uint32_t k_out_stride, const uint8_t h_ker, const uint8_t w_ker); diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 0ba54d5..b0a4337 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -195,8 +195,9 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, + const uint32_t w_out_stride, + const uint32_t k_out_stride, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, @@ -220,7 +221,8 @@ void ne16_task_set_dims_stride2x2( ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, k_out_stride); ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, - k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); + k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, + 0); const uint8_t padding_bottom_new = (h_in + padding_top - h_ker) % stride == 0 ? 
0 : padding_bottom; diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index df16b6c..0823b81 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -156,8 +156,9 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, + const uint32_t w_out_stride, + const uint32_t k_out_stride, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); void ne16_task_set_dims_stride2x2( diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c index 78ef09a..5021e3f 100644 --- a/neureka/bsp/neureka_siracusa_bsp.c +++ b/neureka/bsp/neureka_siracusa_bsp.c @@ -23,8 +23,9 @@ #define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000) #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18 -#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \ - (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \ + (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \ + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS) #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800 #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff @@ -89,4 +90,6 @@ static const neureka_dev_t neureka_siracusa_dev = { .hwpe_dev = (struct hwpe_dev_t){ .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}}; -const neureka_dev_t *neureka_siracusa_get_dev() { return &neureka_siracusa_dev; } +const neureka_dev_t *neureka_siracusa_get_dev() { + return &neureka_siracusa_dev; +} diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 8b0d559..9c5f30c 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -23,8 +23,8 @@ #include "pulp_nnx_util.h" inline uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width) { + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { uint32_t tile_padding = padding; if (i_height > 0) { tile_padding &= ~(0xf << 28); @@ -45,23 +45,26 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, const uint8_t depthwise, const uint8_t input_bits, const uint8_t output_bits, const uint8_t weights_bits, const neureka_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, neureka_quant_t quant, - neureka_norm_t norm, const uint8_t stride) { + const uint32_t weights_offset_factor, + neureka_quant_t quant, neureka_norm_t norm, + const uint8_t stride) { const uint32_t flag_mode16 = input_bits == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; *task = (neureka_task_t){ .outbytes = output_bits / 8, .weight_d0_stride = flag_mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 - : kernel_shape == 3 ? - NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 : - NEUREKA_WEIGHT_D0_STRIDE_MODE8, + : kernel_shape == 3 + ? NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 + : NEUREKA_WEIGHT_D0_STRIDE_MODE8, .qw = weights_bits, .stride_shift = stride == 2 ? 1 : 0, - .output_channel_throughput = depthwise ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, - .input_channel_throughput = kernel_shape == 3 ? 
NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_INPUT_CHANNEL_THROUGHPUT, + .output_channel_throughput = depthwise + ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 + : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, + .input_channel_throughput = kernel_shape == 3 + ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 + : NEUREKA_INPUT_CHANNEL_THROUGHPUT, .kernel_shape = kernel_shape, .depthwise = depthwise, .data = {0}}; @@ -74,10 +77,12 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | - norm.mode | norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | + (quant.shift_amount << 16) | + quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | norm.mode | + norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | NEUREKA_FLAG_USE_TCDM | - weights_offset_mode | flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2; + weights_offset_mode | flag_mode | flag_mode16 | (weights_bits - 1) | + flag_stride2x2; task->data.cfg.weight_offset_factor = weights_offset_factor; } @@ -89,20 +94,20 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, * Necessary for input pointer when it's padded. */ inline uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, + const uint8_t padding_left) { return ptr - (padding_top * width + padding_left) * channel * bits / 8; } inline void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { - task->data.infeat_ptr = - neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); + uint32_t w_in, uint32_t k_in, uint8_t bits_in, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr) { + task->data.infeat_ptr = neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, + padding_top, padding_left); task->data.outfeat_ptr = output_ptr; task->data.weights_ptr = weights_ptr; task->data.scale_ptr = scale_ptr; @@ -111,18 +116,20 @@ inline void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, } void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, - const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride) { + const uint32_t w_in_stride, + const uint32_t k_in_stride, + const uint32_t w_out_stride, + const uint32_t k_out_stride) { const uint32_t num_k_in = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); const neureka_stride_t input_stride = { .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, - .d2 = task->depthwise ? 0 : - task->kernel_shape == 1 ? k_in_stride * 3 * 3 : // TODO: Check this magic - k_in_stride * NEUREKA_FILTER_BUFFER_SIZE * NEUREKA_FILTER_BUFFER_SIZE}; + .d2 = task->depthwise ? 0 + : task->kernel_shape == 1 ? 
k_in_stride * 3 * 3 + : // TODO: Check this magic + k_in_stride * NEUREKA_FILTER_BUFFER_SIZE * + NEUREKA_FILTER_BUFFER_SIZE}; task->data.cfg.input_stride = input_stride; // WARNING: Stride works only for even output channel sizes (divisible by 2) @@ -140,8 +147,8 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, task->data.cfg.weights_stride.d2 = 0; } else if (!task->depthwise) { task->data.cfg.weights_stride.d0 = task->weight_d0_stride; - task->data.cfg.weights_stride.d1 = task->weight_d0_stride * task->qw * - num_k_in; + task->data.cfg.weights_stride.d1 = + task->weight_d0_stride * task->qw * num_k_in; task->data.cfg.weights_stride.d2 = 0; } else { task->data.cfg.weights_stride.d0 = @@ -152,9 +159,10 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, } void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, - const uint32_t h_out, const uint32_t w_out, - const uint32_t k_out, const uint8_t padding_bottom, - const uint8_t padding_right) { + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right) { const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput); const uint16_t num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); @@ -164,10 +172,10 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput); const uint16_t rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); const uint16_t rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const uint16_t rem_Hi = - (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; // TODO: Check padding bottom - const uint16_t rem_Wi = - (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right; // TODO: Check padding right + const uint16_t rem_Hi = (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - + padding_bottom; // TODO: Check padding bottom + const uint16_t rem_Wi = (task->kernel_shape == 1 ? 
rem_Wo : rem_Wo + 2) - + padding_right; // TODO: Check padding right const neureka_subtile_t subtile = { .number = {.KoKi = concat_half(num_Ko, num_Ki), @@ -179,34 +187,34 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, } inline void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value) { + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | (value & 0xff); } -inline void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, - const uint8_t right, const uint8_t bottom, - const uint8_t left) { +inline void neureka_task_set_mask_filter(neureka_task_t *task, + const uint8_t top, const uint8_t right, + const uint8_t bottom, + const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } -void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, - const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, - const uint8_t padding_right, - const uint8_t padding_left) { +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, const uint8_t padding_left) { neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + k_out_stride); neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, - padding_right); + padding_right); neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, - padding_right, 0); + padding_right, 0); } void neureka_task_set_dims_stride2x2( @@ -220,9 +228,10 @@ void neureka_task_set_dims_stride2x2( const uint8_t stride = 2; neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + k_out_stride); neureka_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, - k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); + k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, + 0); const uint8_t padding_bottom_new = (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom; @@ -230,5 +239,5 @@ void neureka_task_set_dims_stride2x2( (w_in + padding_left - w_ker) % stride == 0 ? 
0 : padding_right; neureka_task_set_padding(task, padding_top, padding_bottom_new, padding_left, - padding_right_new, 0); + padding_right_new, 0); } diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h index 7f4c31b..70b80e5 100644 --- a/neureka/hal/neureka_task.h +++ b/neureka/hal/neureka_task.h @@ -122,45 +122,47 @@ typedef struct neureka_task_t { } neureka_task_t; void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const neureka_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, neureka_quant_t quant, - neureka_norm_t norm, const uint8_t stride); + const uint8_t depthwise, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weights_bits, + const neureka_weight_offset_mode_e weights_offset_mode, + const uint32_t weights_offset_factor, + neureka_quant_t quant, neureka_norm_t norm, + const uint8_t stride); uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width); + uint32_t i_width, uint32_t n_height, + uint32_t n_width); uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, const uint8_t padding_left); -void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t w_in, - uint32_t k_in, uint8_t bits_in, uint8_t padding_top, - uint8_t padding_left, uint32_t output_ptr, - uint32_t weights_ptr, uint32_t scale_ptr, - uint32_t shift_ptr, uint32_t bias_ptr); + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, const uint8_t padding_left); +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t k_in, uint8_t bits_in, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr); void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, - const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); + const uint32_t w_in_stride, + const uint32_t k_in_stride, + const uint32_t w_out_stride, + const uint32_t k_out_stride); void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, - const uint32_t h_out, const uint32_t w_out, - const uint32_t k_out, const uint8_t padding_bottom, - const uint8_t padding_right); + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right); void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value); + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value); void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, - const uint8_t right, const uint8_t bottom, - const uint8_t left); -void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, - const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, - const uint8_t padding_right, - const uint8_t padding_left); + const uint8_t right, const uint8_t bottom, + const uint8_t left); +void 
neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, const uint8_t padding_left); void neureka_task_set_dims_stride2x2( neureka_task_t *task, const uint32_t h_in, const uint32_t w_in, const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index daa9897..df9635d 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -89,7 +89,8 @@ #define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) #define NEUREKA_FLAG_USE_WMEM (1 << 9) #define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // TODO: Check if the `STRIDED` mode is still `STRIDE_2x2` +#define NEUREKA_FLAG_STRIDE_2x2 \ + (1 << 8) // TODO: Check if the `STRIDED` mode is still `STRIDE_2x2` #define NEUREKA_FLAG_LINEAR_MODE (1 << 7) #define NEUREKA_FLAG_MODE_3x3 (0 << 5) #define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c index 7ab0e99..6417b07 100644 --- a/src/pulp_nnx_ne16.c +++ b/src/pulp_nnx_ne16.c @@ -88,8 +88,8 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, } void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint32_t w_out_stride, const uint32_t k_out_stride, const uint8_t h_ker, const uint8_t w_ker) { diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c index 440ec07..c7e2c64 100644 --- a/src/pulp_nnx_neureka.c +++ b/src/pulp_nnx_neureka.c @@ -88,8 +88,8 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, } void neureka_nnx_dispatch_stride2x2( - neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint32_t w_out_stride, const uint32_t k_out_stride, const uint8_t h_ker, const uint8_t w_ker) { diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 414f0a3..0c43db9 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -105,17 +105,17 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; static void task_prepare(nnx_task_t *task) { nnx_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, - WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, - (neureka_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode8Bit, - .function = HAS_RELU ? quantFunctionRelu - : quantFunctionIdentity, - .flag_rounding = nnxTaskFlagFalse}, - (neureka_norm_t){.mode = normMode8Bit, - .flag_bias = HAS_BIAS ? 
nnxTaskFlagTrue - : nnxTaskFlagFalse, - .flag_shift = nnxTaskFlagFalse}, - STRIDE_HEIGHT); + WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, + (neureka_quant_t){.shift_amount = OUTSHIFT, + .mode = quantMode8Bit, + .function = HAS_RELU ? quantFunctionRelu + : quantFunctionIdentity, + .flag_rounding = nnxTaskFlagFalse}, + (neureka_norm_t){.mode = normMode8Bit, + .flag_bias = HAS_BIAS ? nnxTaskFlagTrue + : nnxTaskFlagFalse, + .flag_shift = nnxTaskFlagFalse}, + STRIDE_HEIGHT); if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { nnx_task_set_dims_stride2x2( @@ -125,18 +125,18 @@ static void task_prepare(nnx_task_t *task) { PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); } else { nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); + INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, + OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, + PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); } nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, - INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, - (uint32_t)weight, (uint32_t)scale, NULL, + INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight, (uint32_t)scale, NULL, #if HAS_BIAS == 1 - (uint32_t)bias + (uint32_t)bias #else - NULL + NULL #endif ); } @@ -145,7 +145,7 @@ static void task_execute(nnx_task_t *task) { nnx_dev_t *dev = nnx_bsp_get_dev(); nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL_CONFIG, - NNX_GVSOC_LOG_FORMAT_HEXADECIMAL); + NNX_GVSOC_LOG_FORMAT_HEXADECIMAL); nnx_bsp_conf_t conf = {.max_stall = 8}; nnx_init(dev, &conf); @@ -154,9 +154,9 @@ static void task_execute(nnx_task_t *task) { if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, - WEIGHT_HEIGHT, WEIGHT_WIDTH); + INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, + OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, + WEIGHT_HEIGHT, WEIGHT_WIDTH); } else { nnx_dispatch(dev, task); } From a114acd160cb04974c77f343bdaf777ab333fbc5 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 16 Jan 2024 16:19:41 +0100 Subject: [PATCH 05/72] Fix strides and counters, remove stride2x2 and flag16 mode --- neureka/hal/neureka_task.c | 94 +++++++++------------------------ neureka/hal/neureka_task_defs.h | 27 +++++----- 2 files changed, 38 insertions(+), 83 deletions(-) diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 9c5f30c..9aeaa76 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -48,15 +48,8 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, const uint32_t weights_offset_factor, neureka_quant_t quant, neureka_norm_t norm, const uint8_t stride) { - const uint32_t flag_mode16 = - input_bits == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - *task = (neureka_task_t){ .outbytes = output_bits / 8, - .weight_d0_stride = flag_mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 - : kernel_shape == 3 - ? NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 - : NEUREKA_WEIGHT_D0_STRIDE_MODE8, .qw = weights_bits, .stride_shift = stride == 2 ? 1 : 0, .output_channel_throughput = depthwise @@ -64,13 +57,11 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, .input_channel_throughput = kernel_shape == 3 ? 
NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_INPUT_CHANNEL_THROUGHPUT, + : NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1, .kernel_shape = kernel_shape, .depthwise = depthwise, .data = {0}}; - const int flag_stride2x2 = stride == 2 ? NEUREKA_FLAG_STRIDE_2x2 : 0; - const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 : depthwise == 1 ? NEUREKA_FLAG_MODE_3x3_DW : NEUREKA_FLAG_MODE_3x3; @@ -81,8 +72,7 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | norm.mode | norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | NEUREKA_FLAG_USE_TCDM | - weights_offset_mode | flag_mode | flag_mode16 | (weights_bits - 1) | - flag_stride2x2; + weights_offset_mode | flag_mode | (weights_bits - 1); task->data.cfg.weight_offset_factor = weights_offset_factor; } @@ -120,41 +110,32 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, const uint32_t k_in_stride, const uint32_t w_out_stride, const uint32_t k_out_stride) { - const uint32_t num_k_in = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); + const uint32_t num_k_in = divnceil(k_in, task->input_channel_throughput); const neureka_stride_t input_stride = { .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, - .d2 = task->depthwise ? 0 - : task->kernel_shape == 1 ? k_in_stride * 3 * 3 - : // TODO: Check this magic - k_in_stride * NEUREKA_FILTER_BUFFER_SIZE * - NEUREKA_FILTER_BUFFER_SIZE}; + .d2 = 0 // Unused + }; task->data.cfg.input_stride = input_stride; - // WARNING: Stride works only for even output channel sizes (divisible by 2) const neureka_stride_t output_stride = { - .d0 = 32, - .d1 = (k_out_stride * task->outbytes) >> task->stride_shift, - .d2 = - (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift}; + .d0 = 32, // TODO: should depend on outbytes. 
Probably 32 / outbytes + .d1 = k_out_stride * task->outbytes, + .d2 = k_out_stride * task->outbytes * w_out_stride + }; task->data.cfg.output_stride = output_stride; - if (task->kernel_shape == 1) { - task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw; + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3; + task->data.cfg.weights_stride.d2 = 0; + if (task->kernel_shape == 1) { // 1x1 task->data.cfg.weights_stride.d1 = - task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; - } else if (!task->depthwise) { - task->data.cfg.weights_stride.d0 = task->weight_d0_stride; + NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * num_k_in; + } else if (!task->depthwise) { // 3x3 task->data.cfg.weights_stride.d1 = - task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; - } else { - task->data.cfg.weights_stride.d0 = - NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * task->weight_d0_stride; + NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * task->qw * num_k_in; + } else { // 3x3 depthwise task->data.cfg.weights_stride.d1 = 0; - task->data.cfg.weights_stride.d2 = 0; } } @@ -165,16 +146,16 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint8_t padding_right) { const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput); - const uint16_t num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const uint16_t num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput); - const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput); - const uint16_t rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const uint16_t rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const uint16_t rem_Hi = (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - + const uint16_t num_Ho = divnceil(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT); + const uint16_t num_Wo = divnceil(w_out, NEUREKA_COMPUTE_SIZE_WIDTH); + + const uint16_t rem_Ko = k_out % task->output_channel_throughput; + const uint16_t rem_Ki = k_in % task->input_channel_throughput; + const uint16_t rem_Ho = h_out % NEUREKA_COMPUTE_SIZE_HEIGHT; + const uint16_t rem_Wo = w_out % NEUREKA_COMPUTE_SIZE_WIDTH; + const uint16_t rem_Hi = rem_Ho == 0 ? 0 : (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; // TODO: Check padding bottom - const uint16_t rem_Wi = (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - + const uint16_t rem_Wi = rem_Wo == 0 ? 0 : (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right; // TODO: Check padding right const neureka_subtile_t subtile = { @@ -216,28 +197,3 @@ void neureka_task_set_dims( neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, padding_right, 0); } - -void neureka_task_set_dims_stride2x2( - neureka_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, - const uint8_t padding_bottom, const uint8_t padding_right, - const uint8_t padding_left) { - const uint8_t stride = 2; - - neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); - neureka_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, - k_out, h_in + padding_top >= 5 ? 
0 : padding_bottom,
-                            0);
-
-  const uint8_t padding_bottom_new =
-      (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
-  const uint8_t padding_right_new =
-      (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right;
-
-  neureka_task_set_padding(task, padding_top, padding_bottom_new, padding_left,
-                           padding_right_new, 0);
-}
diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h
index df9635d..5de470d 100644
--- a/neureka/hal/neureka_task_defs.h
+++ b/neureka/hal/neureka_task_defs.h
@@ -23,16 +23,15 @@

/* ARCHITECTURE */

-#define NEUREKA_FILTER_SIZE (6)
-#define NEUREKA_FILTER_BUFFER_SIZE (8)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_COMPUTE_SIZE_HEIGHT (6)
+#define NEUREKA_COMPUTE_SIZE_WIDTH (6)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1 (32)
#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
#define NEUREKA_WEIGHT_BANDWIDTH (256)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1 / 8)
#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)

/* TASK REGISTERS */

@@ -71,33 +70,33 @@

/* CONF0 FLAGS */

+#define NEUREKA_FLAG_SIGNED_ACTIVATION (1 << 26)
#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
-#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
+#define NEUREKA_QUANT_MODE_16BIT (1 << 21) // not supported
#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
// conf0[20:16] - quantization shift amount
-#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
-#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
+#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc
+#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) // Unimplemented in gvsoc
#define NEUREKA_FLAG_STREAMIN (1 << 14)
#define NEUREKA_NORM_MODE_8BIT (0 << 12)
-#define NEUREKA_NORM_MODE_16BIT (1 << 12)
+#define NEUREKA_NORM_MODE_16BIT (1 << 12) // not supported
#define NEUREKA_NORM_MODE_32BIT (2 << 12)
-#define NEUREKA_FLAG_ROUND (1 << 11)
+#define NEUREKA_FLAG_ROUND (1 << 11) // not supported
#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
#define NEUREKA_FLAG_USE_WMEM (1 << 9)
#define NEUREKA_FLAG_USE_TCDM (0 << 9)
-#define NEUREKA_FLAG_STRIDE_2x2 \
-  (1 << 8) // TODO: Check if the `STRIDED` mode is still `STRIDE_2x2`
-#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
+#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // not supported
+#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not supported
#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
-#define NEUREKA_FLAG_MODE16 (1 << 3)
+#define NEUREKA_FLAG_MODE16 (1 << 3) // not supported

/* Masks */

From 94bbe08d9362bac7a8be315a274b13e8d3d81ccf Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Tue, 16 Jan 2024 16:20:22 +0100
Subject: [PATCH 06/72] Fix uninitialized L1 data

---
 test/app/src/main.c      | 15 +++++++++++++++
 test/app/src/nnx_layer.c | 15 +++++++++------
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/test/app/src/main.c b/test/app/src/main.c
index cc67050..8f6c3ba 100644
--- a/test/app/src/main.c
+++ 
b/test/app/src/main.c @@ -23,6 +23,15 @@ #include "layer_util.h" #include "nnx_layer.h" #include "output.h" +#include "input.h" +#include "bias.h" +#include "scale.h" +#include "weight.h" + +#define memcpy(dst, src, size) \ +for (int i = 0; i < size; i++) { \ + dst[i] = src[i]; \ +} int main() { struct pi_device cl_dev; @@ -35,6 +44,11 @@ int main() { printf("\n"); layer_info(); + memcpy(input, input_l2, INPUT_SIZE); + memcpy(bias, bias_l2, BIAS_SIZE); + memcpy(scale, scale_l2, SCALE_SIZE); + memcpy(weight, weight_l2, WEIGHT_SIZE); + pi_cluster_conf_init(&cl_conf); pi_open_from_conf(&cl_dev, &cl_conf); if (pi_cluster_open(&cl_dev)) { @@ -49,6 +63,7 @@ int main() { printf("Test %s finished\n", TEST_NAME); printf("\n"); + memcpy(output, output_l2, OUTPUT_SIZE); check_output(); return 0; diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 0c43db9..921c679 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -43,8 +43,8 @@ typedef ne16_pulp_conf_t nnx_bsp_conf_t; #define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 #define nnx_task_set_ptrs ne16_task_set_ptrs -#define NNX_GVSOC_LOG_LEVEL_CONFIG NE16_GVSOC_LOG_LEVEL_CONFIG -#define NNX_GVSOC_LOG_FORMAT_HEXADECIMAL NE16_GVSOC_LOG_FORMAT_HEXADECIMAL +#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_CONFIG +#define NNX_GVSOC_LOG_FORMAT NE16_GVSOC_LOG_FORMAT_HEXADECIMAL #define nnx_gvsoc_log_activate ne16_gvsoc_log_activate #define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate @@ -79,8 +79,8 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #define nnx_task_set_dims_stride2x2 neureka_task_set_dims_stride2x2 #define nnx_task_set_ptrs neureka_task_set_ptrs -#define NNX_GVSOC_LOG_LEVEL_CONFIG NEUREKA_GVSOC_LOG_LEVEL_CONFIG -#define NNX_GVSOC_LOG_FORMAT_HEXADECIMAL NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL +#define NNX_GVSOC_LOG_LEVEL NEUREKA_GVSOC_LOG_LEVEL_ALL +#define NNX_GVSOC_LOG_FORMAT NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL #define nnx_gvsoc_log_activate neureka_gvsoc_log_activate #define nnx_gvsoc_log_deactivate neureka_gvsoc_log_deactivate @@ -139,13 +139,16 @@ static void task_prepare(nnx_task_t *task) { NULL #endif ); + + printf("input addr: @%p\n", input); + printf("task input addr: @%p\n", task->data.infeat_ptr); } static void task_execute(nnx_task_t *task) { nnx_dev_t *dev = nnx_bsp_get_dev(); - nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL_CONFIG, - NNX_GVSOC_LOG_FORMAT_HEXADECIMAL); + nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, + NNX_GVSOC_LOG_FORMAT); nnx_bsp_conf_t conf = {.max_stall = 8}; nnx_init(dev, &conf); From 6341d0ed4186e1a2124ed6edf4471e94cda46947 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 16 Jan 2024 17:53:02 +0100 Subject: [PATCH 07/72] Fixup nnx_quant_t nnx_norm_t --- test/app/src/nnx_layer.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 921c679..696ee31 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -104,18 +104,18 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #include "weight.h" static void task_prepare(nnx_task_t *task) { - nnx_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, - WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, - (neureka_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode8Bit, - .function = HAS_RELU ? quantFunctionRelu - : quantFunctionIdentity, - .flag_rounding = nnxTaskFlagFalse}, - (neureka_norm_t){.mode = normMode8Bit, - .flag_bias = HAS_BIAS ? 
nnxTaskFlagTrue
-                                          : nnxTaskFlagFalse,
-                          .flag_shift = nnxTaskFlagFalse},
-                STRIDE_HEIGHT);
+  nnx_task_init(
+      task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS,
+      weightOffsetModeLayerWise, WEIGHT_OFFSET,
+      (nnx_quant_t){.shift_amount = OUTSHIFT,
+                    .mode = quantMode8Bit,
+                    .function =
+                        HAS_RELU ? quantFunctionRelu : quantFunctionIdentity,
+                    .flag_rounding = nnxTaskFlagFalse},
+      (nnx_norm_t){.mode = normMode8Bit,
+                   .flag_bias = HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse,
+                   .flag_shift = nnxTaskFlagFalse},
+      STRIDE_HEIGHT);

   if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) {
     nnx_task_set_dims_stride2x2(
@@ -147,8 +147,7 @@ static void task_prepare(nnx_task_t *task) {
 static void task_execute(nnx_task_t *task) {
   nnx_dev_t *dev = nnx_bsp_get_dev();

-  nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL,
-                         NNX_GVSOC_LOG_FORMAT);
+  nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, NNX_GVSOC_LOG_FORMAT);

   nnx_bsp_conf_t conf = {.max_stall = 8};
   nnx_init(dev, &conf);

From eddf3894772c91ee7890b30b7fb968262bfcf260 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Wed, 17 Jan 2024 11:01:42 +0100
Subject: [PATCH 08/72] Add Neureka weight roll/unroll script

---
 test/Neureka.py | 133 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 test/Neureka.py

diff --git a/test/Neureka.py b/test/Neureka.py
new file mode 100644
index 0000000..2c6dd6f
--- /dev/null
+++ b/test/Neureka.py
@@ -0,0 +1,133 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import numpy.typing as npt
+from TestClasses import IntegerType
+
+
+class Neureka:
+    ACCUMULATOR_TYPE = IntegerType(name="int32")
+
+    _WEIGHT_BANDWIDTH = 256
+    _CIN_SUBTILE_1x1 = 32
+    _CIN_SUBTILE_3x3 = 28
+
+    @staticmethod
+    def weight_unroll(
+        weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
+    ) -> npt.NDArray[np.uint8]:
+        """Unroll weight into expected memory format
+
+        Expected weight shape is (cout, cin, H, W).
+        The produced memory layout depends on the weight kernel shape:
+        - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
+        - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
+        where cinMajor is ceil(cin / cinSubtile) and cinMinor has to be
+        zero-padded up to cinSubtile.
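+
+        For example, a 3x3 kernel with cin=30 has cinSubtile=28, so
+        cinMajor = ceil(30/28) = 2 and cin is zero-padded from 30 to 56;
+        each (cout, cinMajor, bit) row then packs
+        H x W x cinMinor_3x3 = 3*3*28 = 252 bits, zero-padded up to the
+        256-bit weight bandwidth.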
+ """ + if depthwise: + weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin + + cout, cin, height, width = weight.shape + cinSubtile = ( + Neureka._CIN_SUBTILE_3x3 if height == 3 else Neureka._CIN_SUBTILE_1x1 + ) + + # Pad cin to be divisible with CIN_SUBTILE + if cin % cinSubtile != 0: + cinPad = cinSubtile - cin % cinSubtile + weight = np.pad( + weight, + ((0, 0), (0, cinPad), (0, 0), (0, 0)), + "constant", + constant_values=0, + ) + + # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1) + # The 1 at the end is required by the unpacking + cinMajor = int(np.ceil(cin / cinSubtile)) + cinMinor = cinSubtile + weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) + + # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0] + # (cout, cinMajor, cinMinor, Flattened spatial, Bits) + weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") + + # Shuffle bits so that the final shape is: + # (cout, cinMajor, Bits, Flattened spatial, cinMinor) + weight = weight.transpose(0, 1, 4, 3, 2) + + # Pack dimensions to fit into weight bandwidth + if height == 3 and width == 3: + # (cout * cinMajor * Bits, H * W * cinMinor) + weight = weight.reshape(-1, height * width * cinMinor) + elif height == 1 and width == 1: + # (cout * cinMajor, Bits * H * W * cinMinor) + weight = weight.reshape(-1, bits * height * width * cinMinor) + + # Pad only the last dimension to weight bandwidth size + # (-1, Weight Bandwidth) + weight = np.pad( + weight, + ((0, 0), (0, Neureka._WEIGHT_BANDWIDTH - weight.shape[-1])), + "constant", + constant_values=0, + ) + + # Prepare for packing + # (-1, Weight Bandwidth Bytes, 8) + weightBandwidthBytes = int(np.ceil(Neureka._WEIGHT_BANDWIDTH / 8)) + weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2) + + # Pack bits + # (-1, Weight Bandwidth Bytes) + weight = np.packbits(weight, axis=-1, bitorder="little") + + return weight.flatten() + + @staticmethod + def weight_roll( + weight: npt.NDArray[np.uint8], + bits: int, + cout: int, + cin: int, + height: int, + width: int, + ) -> npt.NDArray[np.uint8]: + """Reverse of weight_roll""" + cinSubtile = ( + Neureka._CIN_SUBTILE_3x3 if height == 3 else Neureka._CIN_SUBTILE_1x1 + ) + cinMajor = int(np.ceil(cin / cinSubtile)) + cinMinor = cinSubtile + weightBandwidthBytes = int(np.ceil(Neureka._WEIGHT_BANDWIDTH / 8)) + + weight = weight.reshape(-1, weightBandwidthBytes, 1) + weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") + weight = weight.reshape(-1, Neureka._WEIGHT_BANDWIDTH) + if height == 3 and width == 3: + weight = weight[:, : height * width * cinMinor] + elif height == 1 and width == 1: + weight = weight[:, : bits * height * width * cinMinor] + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor) + weight = weight.transpose(0, 1, 4, 3, 2) + weight = np.packbits(weight, axis=-1, bitorder="little") + weight = weight.reshape(cout, cinMajor * cinMinor, height, width) + weight = weight[:, :cin, :, :] + + return weight From e281f4d0eea3f951e2ff000757f36f0f9dcb93bd Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 17 Jan 2024 11:02:20 +0100 Subject: [PATCH 09/72] Fix Ne16 weight rolling was unpacking to bits instead of 8 --- test/Ne16.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Ne16.py b/test/Ne16.py index 6de5ab5..d6abaf2 100644 --- a/test/Ne16.py +++ b/test/Ne16.py @@ -84,7 +84,7 @@ def weight_roll(weight: np.ndarray, bits: int, Cout: int, Cin: int, H: int, W: i Cin_minor_bytes = int(np.ceil(Cin_minor / 
8)) weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor_bytes, 1) - weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") + weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor) weight = weight.transpose(0, 1, 4, 3, 2) weight = np.packbits(weight, axis=-1, bitorder="little") From d6008b6e7272d53edfef8c20075b2f79f18d3c12 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 08:19:43 +0100 Subject: [PATCH 10/72] Fix generated arrays initialization --- test/HeaderWriter.py | 4 +++- test/app/src/main.c | 12 ++++++------ test/app/src/nnx_layer.c | 5 +---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index 5abb204..5fd0968 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -135,6 +135,7 @@ def generate_vector_header(self, name, size, _type, init=None, golden=None): render = "" render += self.includes render += self.render_vector(name, "extern " + _type, size) + render += self.render_vector(name + "_l2", "extern " + _type, size) if golden is not None: render += self.render_vector("golden_" + name, "extern " + _type, size) @@ -155,10 +156,11 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None): render = "" render += f'#include "{name}.h"\n\n' render += self.render_vector(name, "PI_L1 " + _type, size, init=init) + render += self.render_vector(name + "_l2", "PI_L2 " + _type, size, init=init) if golden is not None: render += self.render_vector( - "golden_" + name, "PI_L1 " + _type, size, init=golden + "golden_" + name, "PI_L2 " + _type, size, init=golden ) render += self.check(name) diff --git a/test/app/src/main.c b/test/app/src/main.c index 8f6c3ba..db32e3f 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -28,7 +28,7 @@ #include "scale.h" #include "weight.h" -#define memcpy(dst, src, size) \ +#define NNX_MEMCPY(dst, src, size) \ for (int i = 0; i < size; i++) { \ dst[i] = src[i]; \ } @@ -44,10 +44,10 @@ int main() { printf("\n"); layer_info(); - memcpy(input, input_l2, INPUT_SIZE); - memcpy(bias, bias_l2, BIAS_SIZE); - memcpy(scale, scale_l2, SCALE_SIZE); - memcpy(weight, weight_l2, WEIGHT_SIZE); + NNX_MEMCPY(input, input_l2, INPUT_SIZE); + NNX_MEMCPY(bias, bias_l2, BIAS_SIZE); + NNX_MEMCPY(scale, scale_l2, SCALE_SIZE); + NNX_MEMCPY(weight, weight_l2, WEIGHT_SIZE); pi_cluster_conf_init(&cl_conf); pi_open_from_conf(&cl_dev, &cl_conf); @@ -63,7 +63,7 @@ int main() { printf("Test %s finished\n", TEST_NAME); printf("\n"); - memcpy(output, output_l2, OUTPUT_SIZE); + NNX_MEMCPY(output_l2, output, OUTPUT_SIZE); check_output(); return 0; diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 696ee31..15ff359 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -43,7 +43,7 @@ typedef ne16_pulp_conf_t nnx_bsp_conf_t; #define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 #define nnx_task_set_ptrs ne16_task_set_ptrs -#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_CONFIG +#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_ALL #define NNX_GVSOC_LOG_FORMAT NE16_GVSOC_LOG_FORMAT_HEXADECIMAL #define nnx_gvsoc_log_activate ne16_gvsoc_log_activate #define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate @@ -139,9 +139,6 @@ static void task_prepare(nnx_task_t *task) { NULL #endif ); - - printf("input addr: @%p\n", input); - printf("task input addr: @%p\n", task->data.infeat_ptr); } static void task_execute(nnx_task_t *task) { From 
56afb5f61c11887145f0a6172c5b2439d2989cea Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 10:30:34 +0100 Subject: [PATCH 11/72] Fix Neureka weight unroll for 1x1 mode --- test/Neureka.py | 64 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/test/Neureka.py b/test/Neureka.py index 2c6dd6f..d844234 100644 --- a/test/Neureka.py +++ b/test/Neureka.py @@ -1,4 +1,5 @@ # Luka Macan +# Arpan Suravi Prasad # # Copyright 2023 ETH Zurich and University of Bologna # @@ -61,33 +62,49 @@ def weight_unroll( # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1) # The 1 at the end is required by the unpacking cinMajor = int(np.ceil(cin / cinSubtile)) - cinMinor = cinSubtile - weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) + weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1) # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0] - # (cout, cinMajor, cinMinor, Flattened spatial, Bits) + # (cout, cinMajor, cinSubtile, Flattened spatial, Bits) weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") # Shuffle bits so that the final shape is: - # (cout, cinMajor, Bits, Flattened spatial, cinMinor) + # (cout, cinMajor, Bits, Flattened spatial, cinSubtile) weight = weight.transpose(0, 1, 4, 3, 2) # Pack dimensions to fit into weight bandwidth if height == 3 and width == 3: - # (cout * cinMajor * Bits, H * W * cinMinor) - weight = weight.reshape(-1, height * width * cinMinor) + # (cout * cinMajor * Bits, H * W * cinSubtile) + weight = weight.reshape(-1, height * width * cinSubtile) + # Pad only the last dimension to weight bandwidth size + # (-1, Weight Bandwidth) + weight = np.pad( + weight, + ((0, 0), (0, Neureka._WEIGHT_BANDWIDTH - weight.shape[-1])), + "constant", + constant_values=0, + ) elif height == 1 and width == 1: - # (cout * cinMajor, Bits * H * W * cinMinor) - weight = weight.reshape(-1, bits * height * width * cinMinor) - - # Pad only the last dimension to weight bandwidth size - # (-1, Weight Bandwidth) - weight = np.pad( - weight, - ((0, 0), (0, Neureka._WEIGHT_BANDWIDTH - weight.shape[-1])), - "constant", - constant_values=0, - ) + # Tile cinSubtile into tiles of size 4 + # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile) + weight = weight.reshape( + cout, cinMajor, bits, height * width, cinSubtile // 4, 4 + ) # cout, cinMajor, bits, 1, 8, 4 + # Pad bits to 8 + if bits < 8: + # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile) + weight = np.pad( + weight, + ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)), + mode="constant", + constant_values=0, + ) + # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile) + weight = weight.transpose(0, 1, 3, 4, 2, 5) + # (-1, Weight Bandwidth) + weight = weight.reshape( + cout * cinMajor, Neureka._WEIGHT_BANDWIDTH + ) # cout*cinMajor, 256b # Prepare for packing # (-1, Weight Bandwidth Bytes, 8) @@ -109,7 +126,7 @@ def weight_roll( height: int, width: int, ) -> npt.NDArray[np.uint8]: - """Reverse of weight_roll""" + """Reverse of weight_unroll""" cinSubtile = ( Neureka._CIN_SUBTILE_3x3 if height == 3 else Neureka._CIN_SUBTILE_1x1 ) @@ -120,12 +137,17 @@ def weight_roll( weight = weight.reshape(-1, weightBandwidthBytes, 1) weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") weight = weight.reshape(-1, Neureka._WEIGHT_BANDWIDTH) + if height == 3 and width == 3: weight = weight[:, : height * width * 
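The 1x1 repacking above is mostly shape bookkeeping: each input-channel subtile is split into groups of 4 lanes, the bit axis is padded to 8, and everything collapses into one 256-bit weight-memory row per subtile. A sketch of just the shapes (cinSubtile = 32 is implied by the 256-bit row; the other values are illustrative):

    import numpy as np

    cout, cinMajor, bits, cinSubtile = 32, 1, 8, 32
    # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
    w = np.zeros((cout, cinMajor, bits, 1 * 1, cinSubtile // 4, 4), dtype=np.uint8)
    # Move the bit axis next to the 4-lane tile: (..., spatial, group, bit, lane)
    w = w.transpose(0, 1, 3, 4, 2, 5)
    # One 256-bit row per (cout, cinMajor) subtile
    w = w.reshape(cout * cinMajor, 256)
    assert w.shape == (32, 256)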
cinMinor]
+        weight = weight.reshape(
+            cout, cinMajor, bits, height * width, cinMinor
+        ).transpose(0, 1, 4, 3, 2)
     elif height == 1 and width == 1:
-        weight = weight[:, : bits * height * width * cinMinor]
-        weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor)
-        weight = weight.transpose(0, 1, 4, 3, 2)
+        weight = weight[:, : height * width * cinMinor * 8]
+        weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
+            0, 1, 2, 4, 3
+        )
     weight = np.packbits(weight, axis=-1, bitorder="little")
     weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
     weight = weight[:, :cin, :, :]

From 925aa417c65933437bef7855beff70101367a4bb Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Thu, 18 Jan 2024 10:52:52 +0100
Subject: [PATCH 12/72] Fix Arpan's name in the contributors

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index be8c9be..ea4c6a8 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ The library will follow the [Semantic Versioning](https://semver.org/).

 * Luka Macan <[luka.macan@unibo.it](mailto:luka.macan@unibo.it)>
 * Francesco Conti <[fconti@unibo.it](mailto:fconti@unibo.it)>
-* Arpan Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)>
+* Arpan Suravi Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)>

 ## License

From 01406650faeaea28eedaa4fade52516a3fee7e5f Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Thu, 18 Jan 2024 13:13:47 +0100
Subject: [PATCH 13/72] Remove WEIGHT_D0_STRIDE_MODE_1x1

---
 neureka/hal/neureka_task.c      | 18 ++++++++----------
 neureka/hal/neureka_task_defs.h |  4 +---
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index 9aeaa76..941fcde 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -126,17 +126,15 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
   };
   task->data.cfg.output_stride = output_stride;

-  task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3;
-  task->data.cfg.weights_stride.d2 = 0;
+  task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_D0_STRIDE;
   if (task->kernel_shape == 1) { // 1x1
-    task->data.cfg.weights_stride.d1 =
-        NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * num_k_in;
+    task->data.cfg.weights_stride.d1 = NEUREKA_WEIGHT_D0_STRIDE * num_k_in;
   } else if (!task->depthwise) { // 3x3
-    task->data.cfg.weights_stride.d1 =
-        NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * task->qw * num_k_in;
+    task->data.cfg.weights_stride.d1 = NEUREKA_WEIGHT_D0_STRIDE * task->qw * num_k_in;
   } else { // 3x3 depthwise
     task->data.cfg.weights_stride.d1 = 0;
   }
+  task->data.cfg.weights_stride.d2 = 0;
 }

 void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
@@ -149,10 +147,10 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
   const uint16_t num_Ho = divnceil(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT);
   const uint16_t num_Wo = divnceil(w_out, NEUREKA_COMPUTE_SIZE_WIDTH);

-  const uint16_t rem_Ko = k_out % task->output_channel_throughput;
-  const uint16_t rem_Ki = k_in % task->input_channel_throughput;
-  const uint16_t rem_Ho = h_out % NEUREKA_COMPUTE_SIZE_HEIGHT;
-  const uint16_t rem_Wo = w_out % NEUREKA_COMPUTE_SIZE_WIDTH;
+  const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput);
+  const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput);
+  const uint16_t rem_Ho = remainder(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT);
+  const uint16_t rem_Wo = remainder(w_out, 
NEUREKA_COMPUTE_SIZE_WIDTH); const uint16_t rem_Hi = rem_Ho == 0 ? 0 : (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; // TODO: Check padding bottom const uint16_t rem_Wi = rem_Wo == 0 ? 0 : (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index 5de470d..f720061 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -29,9 +29,7 @@ #define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) #define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) #define NEUREKA_WEIGHT_BANDWIDTH (256) - -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1 / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8) +#define NEUREKA_WEIGHT_D0_STRIDE (NEUREKA_WEIGHT_BANDWIDTH / 8) /* TASK REGISTERS */ From 741b5d7cd0d91ceaaa3b161bd781510c0259a963 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 13:14:08 +0100 Subject: [PATCH 14/72] Add multi-accelerator support and neureka as a target --- test/Ne16TestConf.py | 140 +++++++++++ test/NeurekaTestConf.py | 140 +++++++++++ .../{Ne16TestClasses.py => NnxTestClasses.py} | 226 +++++------------- test/conf.toml | 2 +- test/conftest.py | 37 ++- test/test.py | 42 +++- test/testgen.py | 132 ++++++---- 7 files changed, 489 insertions(+), 230 deletions(-) create mode 100644 test/Ne16TestConf.py create mode 100644 test/NeurekaTestConf.py rename test/{Ne16TestClasses.py => NnxTestClasses.py} (60%) diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py new file mode 100644 index 0000000..889a1fe --- /dev/null +++ b/test/Ne16TestConf.py @@ -0,0 +1,140 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations +from typing import List, Union, Optional +from Ne16 import Ne16 +from NnxTestClasses import NnxTestConf +from TestClasses import implies, KernelShape, Padding, Stride, IntegerType +from pydantic import field_validator, model_validator + + +class Ne16TestConf(NnxTestConf): + @field_validator("kernel_shape") + @classmethod + def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: + assert v == KernelShape(height=1, width=1) or v == KernelShape( + height=3, width=3 + ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." + return v + + @field_validator("stride") + @classmethod + def check_valid_stride(cls, v: Stride) -> Stride: + assert v == Stride(height=1, width=1) or v == Stride( + height=2, width=2 + ), f"Unsupported stride {v}. Supported 1x1 and 2x2." + return v + + @staticmethod + def _check_type( + name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] + ) -> None: + assert ( + _type in allowed_types + ), f"Unsupported {name} type {_type}. 
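A quick sanity check of the stride arithmetic from PATCH 13 above: with NEUREKA_WEIGHT_BANDWIDTH at 256 bits, NEUREKA_WEIGHT_D0_STRIDE is a 32-byte weight-memory row, and d1 is that row size times a per-mode row count. A Python sketch of the resulting values (num_k_in stands for the number of input-channel subtiles, as in the surrounding code):

    WEIGHT_BANDWIDTH = 256
    WEIGHT_D0_STRIDE = WEIGHT_BANDWIDTH // 8  # 32 bytes: one weight-memory row

    def weights_stride_d1(kernel_shape: int, depthwise: bool, qw: int, num_k_in: int) -> int:
        if kernel_shape == 1:  # 1x1: one row per input-channel subtile
            return WEIGHT_D0_STRIDE * num_k_in
        if not depthwise:      # 3x3: qw bit-plane rows per input-channel subtile
            return WEIGHT_D0_STRIDE * qw * num_k_in
        return 0               # 3x3 depthwise: single subtile, no d1 jump

    assert weights_stride_d1(3, False, qw=8, num_k_in=2) == 512
    assert weights_stride_d1(1, False, qw=8, num_k_in=2) == 64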
Supported types: {allowed_types}" + + @field_validator("in_type") + @classmethod + def check_valid_in_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("in_type", v, ["uint8"]) + return v + + @field_validator("out_type") + @classmethod + def check_valid_out_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("out_type", v, ["uint8", "int8"]) + return v + + @field_validator("weight_type") + @classmethod + def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("weight_type", v, ["int8"]) + return v + + @field_validator("scale_type") + @classmethod + def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"]) + return v + + @field_validator("bias_type") + @classmethod + def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + Ne16TestConf._check_type("bias_type", v, ["int32"]) + return v + + @model_validator(mode="after") # type: ignore + def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf: + assert implies( + self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 + ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" + return self + + @model_validator(mode="after") # type: ignore + def check_valid_depthwise(self) -> Ne16TestConf: + assert implies( + self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) + ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." + assert implies(self.depthwise, self.in_channel == self.out_channel), ( + f"Input and output channel should be the same in a depthwise layer. " + f"input channel: {self.in_channel}, output channel: {self.out_channel}" + ) + return self + + @model_validator(mode="after") # type: ignore + def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf: + assert implies( + self.kernel_shape == KernelShape(height=1, width=1), + self.padding == Padding(top=0, bottom=0, left=0, right=0), + ), f"No padding on 1x1 kernel. Given padding {self.padding}" + return self + + @field_validator("has_norm_quant") + @classmethod + def check_valid_has_norm_quant(cls, v: bool) -> bool: + assert v == True, f"Untested without has_norm_quant." + return v + + @model_validator(mode="after") # type: ignore + def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: + if self.has_norm_quant: + assert self.scale_type is not None, "Scale type was not provided." + if self.has_bias: + assert self.bias_type is not None, "Bias type was not provided." + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_flags(self) -> Ne16TestConf: + assert implies( + not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE + ), ( + f"Without quantization, the output type has to be equal to the " + f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + ) + assert implies( + self.has_norm_quant, + (self.has_relu and not self.out_type._signed) + or (not self.has_relu and self.out_type._signed), + ), ( + f"Output type has to be unsigned when there is relu, otherwise signed. 
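TestClasses.py is not part of this excerpt, but every cross-field validator here leans on its implies helper; from the call sites it reads as plain logical implication, roughly:

    def implies(premise: bool, conclusion: bool) -> bool:
        # a -> b is (not a) or b: the assert only fires when the
        # premise holds and the conclusion fails.
        return (not premise) or conclusion

    assert implies(False, True) and implies(False, False)  # vacuously true
    assert not implies(True, False)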
" + f"Given output type {self.out_type} and has_relu {self.has_relu}" + ) + return self diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py new file mode 100644 index 0000000..dad7fc4 --- /dev/null +++ b/test/NeurekaTestConf.py @@ -0,0 +1,140 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations +from Neureka import Neureka +from typing import List, Union, Optional +from NnxTestClasses import NnxTestConf +from TestClasses import implies, KernelShape, Padding, Stride, IntegerType +from pydantic import field_validator, model_validator + + +class NeurekaTestConf(NnxTestConf): + @field_validator("kernel_shape") + @classmethod + def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: + assert v == KernelShape(height=1, width=1) or v == KernelShape( + height=3, width=3 + ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." + return v + + @field_validator("stride") + @classmethod + def check_valid_stride(cls, v: Stride) -> Stride: + assert v == Stride(height=1, width=1) or v == Stride( + height=2, width=2 + ), f"Unsupported stride {v}. Supported 1x1 and 2x2." + return v + + @staticmethod + def _check_type( + name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] + ) -> None: + assert ( + _type in allowed_types + ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}" + + @field_validator("in_type") + @classmethod + def check_valid_in_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("in_type", v, ["uint8"]) + return v + + @field_validator("out_type") + @classmethod + def check_valid_out_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("out_type", v, ["uint8", "int8"]) + return v + + @field_validator("weight_type") + @classmethod + def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("weight_type", v, ["int8"]) + return v + + @field_validator("scale_type") + @classmethod + def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + NeurekaTestConf._check_type("scale_type", v, ["uint8", "uint32"]) + return v + + @field_validator("bias_type") + @classmethod + def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + NeurekaTestConf._check_type("bias_type", v, ["int32"]) + return v + + @model_validator(mode="after") # type: ignore + def check_valid_out_channel_with_stride_2x2(self) -> NeurekaTestConf: + assert implies( + self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 + ), f"With stride 2x2 supported only even output channel sizes. 
Given output channel {self.out_channel}" + return self + + @model_validator(mode="after") # type: ignore + def check_valid_depthwise(self) -> NeurekaTestConf: + assert implies( + self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) + ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." + assert implies(self.depthwise, self.in_channel == self.out_channel), ( + f"Input and output channel should be the same in a depthwise layer. " + f"input channel: {self.in_channel}, output channel: {self.out_channel}" + ) + return self + + @model_validator(mode="after") # type: ignore + def check_valid_padding_with_kernel_shape_1x1(self) -> NeurekaTestConf: + assert implies( + self.kernel_shape == KernelShape(height=1, width=1), + self.padding == Padding(top=0, bottom=0, left=0, right=0), + ), f"No padding on 1x1 kernel. Given padding {self.padding}" + return self + + @field_validator("has_norm_quant") + @classmethod + def check_valid_has_norm_quant(cls, v: bool) -> bool: + assert v == True, f"Untested without has_norm_quant." + return v + + @model_validator(mode="after") # type: ignore + def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NeurekaTestConf: + if self.has_norm_quant: + assert self.scale_type is not None, "Scale type was not provided." + if self.has_bias: + assert self.bias_type is not None, "Bias type was not provided." + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_flags(self) -> NeurekaTestConf: + assert implies( + not self.has_norm_quant, self.out_type == Neureka.ACCUMULATOR_TYPE + ), ( + f"Without quantization, the output type has to be equal to the " + f"accumulator type {Neureka.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + ) + assert implies( + self.has_norm_quant, + (self.has_relu and not self.out_type._signed) + or (not self.has_relu and self.out_type._signed), + ), ( + f"Output type has to be unsigned when there is relu, otherwise signed. " + f"Given output type {self.out_type} and has_relu {self.has_relu}" + ) + return self diff --git a/test/Ne16TestClasses.py b/test/NnxTestClasses.py similarity index 60% rename from test/Ne16TestClasses.py rename to test/NnxTestClasses.py index d99e829..ed1b55e 100644 --- a/test/Ne16TestClasses.py +++ b/test/NnxTestClasses.py @@ -17,18 +17,18 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import List, Union, Optional, Set, Tuple +from typing import Callable, Union, Optional, Set, Tuple, Type import torch import numpy as np +import numpy.typing as npt import torch.nn.functional as F import os -from Ne16 import Ne16 from HeaderWriter import HeaderWriter -from TestClasses import implies, KernelShape, Padding, Stride, IntegerType -from pydantic import BaseModel, field_validator, model_validator, PositiveInt +from TestClasses import IntegerType, Stride, Padding, KernelShape, implies +from pydantic import BaseModel, PositiveInt -class Ne16TestConf(BaseModel): +class NnxTestConf(BaseModel): in_height: PositiveInt in_width: PositiveInt in_channel: PositiveInt @@ -46,122 +46,8 @@ class Ne16TestConf(BaseModel): has_bias: bool has_relu: bool - @field_validator("kernel_shape") - @classmethod - def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: - assert v == KernelShape(height=1, width=1) or v == KernelShape( - height=3, width=3 - ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." 
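Because these conf classes are plain pydantic models, an unsupported layer description fails fast at model_validate time rather than deep inside test generation. A sketch with a deliberately bad kernel shape (the field set and the string-typed integer types are inferred from the validators and conf.toml in this series; adjust to the real schema):

    from pydantic import ValidationError
    from Ne16TestConf import Ne16TestConf

    conf = {
        "in_height": 4, "in_width": 3, "in_channel": 8, "out_channel": 8,
        "kernel_shape": {"height": 5, "width": 5},  # unsupported on purpose
        "depthwise": False,
        "stride": {"height": 1, "width": 1},
        "padding": {"top": 0, "bottom": 0, "left": 0, "right": 0},
        "in_type": "uint8", "out_type": "int8", "weight_type": "int8",
        "scale_type": "uint8", "bias_type": "int32",
        "has_norm_quant": True, "has_bias": True, "has_relu": False,
    }

    try:
        Ne16TestConf.model_validate(conf)
    except ValidationError as e:
        print(e)  # reports: Unsupported kernel shape ... Supported 1x1 and 3x3.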
- return v - - @field_validator("stride") - @classmethod - def check_valid_stride(cls, v: Stride) -> Stride: - assert v == Stride(height=1, width=1) or v == Stride( - height=2, width=2 - ), f"Unsupported stride {v}. Supported 1x1 and 2x2." - return v - - @staticmethod - def _check_type( - name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] - ) -> None: - assert ( - _type in allowed_types - ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}" - - @field_validator("in_type") - @classmethod - def check_valid_in_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("in_type", v, ["uint8"]) - return v - - @field_validator("out_type") - @classmethod - def check_valid_out_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("out_type", v, ["uint8", "int8"]) - return v - - @field_validator("weight_type") - @classmethod - def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("weight_type", v, ["int8"]) - return v - - @field_validator("scale_type") - @classmethod - def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: - if v is not None: - Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"]) - return v - - @field_validator("bias_type") - @classmethod - def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: - if v is not None: - Ne16TestConf._check_type("bias_type", v, ["int32"]) - return v - - @model_validator(mode="after") # type: ignore - def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf: - assert implies( - self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 - ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" - return self - - @model_validator(mode="after") # type: ignore - def check_valid_depthwise(self) -> Ne16TestConf: - assert implies( - self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) - ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." - assert implies(self.depthwise, self.in_channel == self.out_channel), ( - f"Input and output channel should be the same in a depthwise layer. " - f"input channel: {self.in_channel}, output channel: {self.out_channel}" - ) - return self - @model_validator(mode="after") # type: ignore - def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf: - assert implies( - self.kernel_shape == KernelShape(height=1, width=1), - self.padding == Padding(top=0, bottom=0, left=0, right=0), - ), f"No padding on 1x1 kernel. Given padding {self.padding}" - return self - - @field_validator("has_norm_quant") - @classmethod - def check_valid_has_norm_quant(cls, v: bool) -> bool: - assert v == True, f"Untested without has_norm_quant." - return v - - @model_validator(mode="after") # type: ignore - def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: - if self.has_norm_quant: - assert self.scale_type is not None, "Scale type was not provided." - if self.has_bias: - assert self.bias_type is not None, "Bias type was not provided." - return self - - @model_validator(mode="after") # type: ignore - def check_valid_out_type_with_flags(self) -> Ne16TestConf: - assert implies( - not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE - ), ( - f"Without quantization, the output type has to be equal to the " - f"accumulator type {Ne16.ACCUMULATOR_TYPE}. 
Given output type {self.out_type}" - ) - assert implies( - self.has_norm_quant, - (self.has_relu and not self.out_type._signed) - or (not self.has_relu and self.out_type._signed), - ), ( - f"Output type has to be unsigned when there is relu, otherwise signed. " - f"Given output type {self.out_type} and has_relu {self.has_relu}" - ) - return self - - -class Ne16Test: +class NnxTest: _CONF_NAME = "conf.json" _INPUT_NAME = "input.pt" _OUTPUT_NAME = "output.pt" @@ -172,7 +58,7 @@ class Ne16Test: def __init__( self, - conf: Ne16TestConf, + conf: NnxTestConf, input: Optional[torch.Tensor], output: Optional[torch.Tensor], weight: Optional[torch.Tensor], @@ -188,7 +74,7 @@ def __init__( self.bias = bias self.global_shift = global_shift - def is_valid(self): + def is_valid(self) -> bool: return all( [ self.input is not None, @@ -203,22 +89,22 @@ def is_valid(self): def save_conf(self, path: Union[str, os.PathLike]) -> None: os.makedirs(path, exist_ok=True) - with open(os.path.join(path, Ne16Test._CONF_NAME), "w") as fp: + with open(os.path.join(path, NnxTest._CONF_NAME), "w") as fp: fp.write(self.conf.model_dump_json(indent=4)) def save_data(self, path: Union[str, os.PathLike]) -> None: os.makedirs(path, exist_ok=True) - torch.save(self.input, os.path.join(path, Ne16Test._INPUT_NAME)) - torch.save(self.output, os.path.join(path, Ne16Test._OUTPUT_NAME)) - torch.save(self.weight, os.path.join(path, Ne16Test._WEIGHT_NAME)) + torch.save(self.input, os.path.join(path, NnxTest._INPUT_NAME)) + torch.save(self.output, os.path.join(path, NnxTest._OUTPUT_NAME)) + torch.save(self.weight, os.path.join(path, NnxTest._WEIGHT_NAME)) if self.scale is not None: - torch.save(self.scale, os.path.join(path, Ne16Test._SCALE_NAME)) + torch.save(self.scale, os.path.join(path, NnxTest._SCALE_NAME)) if self.bias is not None: - torch.save(self.bias, os.path.join(path, Ne16Test._BIAS_NAME)) + torch.save(self.bias, os.path.join(path, NnxTest._BIAS_NAME)) if self.global_shift is not None: torch.save( - self.global_shift, os.path.join(path, Ne16Test._GLOBAL_SHIFT_NAME) + self.global_shift, os.path.join(path, NnxTest._GLOBAL_SHIFT_NAME) ) def save(self, path: Union[str, os.PathLike]) -> None: @@ -228,33 +114,33 @@ def save(self, path: Union[str, os.PathLike]) -> None: @staticmethod def is_test_dir(path: Union[str, os.PathLike]) -> bool: fileset = set(os.listdir(path)) - required_fileset = set([Ne16Test._CONF_NAME]) + required_fileset = set([NnxTest._CONF_NAME]) return required_fileset.issubset(fileset) @classmethod - def load(cls, path: Union[str, os.PathLike]) -> "Ne16Test": - assert Ne16Test.is_test_dir( + def load(cls, confCls: Type[NnxTestConf], path: Union[str, os.PathLike]) -> NnxTest: + assert NnxTest.is_test_dir( path ), f"ERROR: Test {path} does not contain the necessary files." 
- with open(os.path.join(path, Ne16Test._CONF_NAME), "r") as fp: - conf = Ne16TestConf.model_validate_json(fp.read()) + with open(os.path.join(path, NnxTest._CONF_NAME), "r") as fp: + conf = confCls.model_validate_json(fp.read()) def load_if_exist(filename: str) -> Optional[torch.Tensor]: filepath = os.path.join(path, filename) return torch.load(filepath) if os.path.isfile(filepath) else None - input = load_if_exist(Ne16Test._INPUT_NAME) - output = load_if_exist(Ne16Test._OUTPUT_NAME) - weight = load_if_exist(Ne16Test._WEIGHT_NAME) - scale = load_if_exist(Ne16Test._SCALE_NAME) - bias = load_if_exist(Ne16Test._BIAS_NAME) - global_shift = load_if_exist(Ne16Test._GLOBAL_SHIFT_NAME) + input = load_if_exist(NnxTest._INPUT_NAME) + output = load_if_exist(NnxTest._OUTPUT_NAME) + weight = load_if_exist(NnxTest._WEIGHT_NAME) + scale = load_if_exist(NnxTest._SCALE_NAME) + bias = load_if_exist(NnxTest._BIAS_NAME) + global_shift = load_if_exist(NnxTest._GLOBAL_SHIFT_NAME) return cls(conf, input, output, weight, scale, bias, global_shift) -class Ne16TestGenerator: +class NnxTestGenerator: _DEFAULT_SEED = 0 @staticmethod @@ -286,17 +172,18 @@ def _cast( @staticmethod def from_conf( - conf: Ne16TestConf, + conf: NnxTestConf, + accumulator_type: IntegerType, input: Optional[torch.Tensor] = None, weight: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, global_shift: Optional[torch.Tensor] = None, - ) -> Ne16Test: - torch.manual_seed(Ne16TestGenerator._DEFAULT_SEED) + ) -> NnxTest: + torch.manual_seed(NnxTestGenerator._DEFAULT_SEED) if input is None: - input = Ne16TestGenerator._random_data( + input = NnxTestGenerator._random_data( _type=conf.in_type, shape=(1, conf.in_channel, conf.in_height, conf.in_width), ) @@ -314,7 +201,7 @@ def from_conf( ) if weight is None: - weight = Ne16TestGenerator._random_data( + weight = NnxTestGenerator._random_data( _type=conf.weight_type, shape=( conf.out_channel, @@ -333,14 +220,14 @@ def from_conf( groups=conf.in_channel if conf.depthwise else 1, ).type(torch.int64) # Use only the lower 32bits - output = Ne16TestGenerator._cast( - output, Ne16.ACCUMULATOR_TYPE, saturate=False - ).type(torch.int32) + output = NnxTestGenerator._cast(output, accumulator_type, saturate=False).type( + torch.int32 + ) if conf.has_norm_quant: if scale is None: assert conf.scale_type is not None - scale = Ne16TestGenerator._random_data( + scale = NnxTestGenerator._random_data( conf.scale_type, shape=(1, conf.out_channel, 1, 1) ) # Scale accumulators are in 48bit, so keeping the data in 64bit @@ -350,16 +237,16 @@ def from_conf( if conf.has_bias: # Saturating cast to int32 assert conf.bias_type is not None - output = Ne16TestGenerator._cast( + output = NnxTestGenerator._cast( output, conf.bias_type, saturate=True ).type(torch.int32) if bias is None: - bias = Ne16TestGenerator._random_data( + bias = NnxTestGenerator._random_data( conf.bias_type, shape=(1, conf.out_channel, 1, 1) ).type(torch.int32) output = output + bias - output = Ne16TestGenerator._cast( + output = NnxTestGenerator._cast( output, conf.bias_type, saturate=False ).type(torch.int32) @@ -367,15 +254,15 @@ def from_conf( output = F.relu(output) if global_shift is None: - global_shift = Ne16TestGenerator._global_shift( + global_shift = NnxTestGenerator._global_shift( output, conf.out_type, conf.has_relu ) output = output >> global_shift # Saturate into out_type - output = Ne16TestGenerator._cast(output, conf.out_type, saturate=True) + output = NnxTestGenerator._cast(output, 
conf.out_type, saturate=True) - return Ne16Test( + return NnxTest( conf=conf, input=input, output=output, @@ -386,28 +273,38 @@ def from_conf( ) @staticmethod - def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test: + def regenerate(test: NnxTest, regen_tensors: Set[str]) -> NnxTest: test_tensors = set(["input", "output", "weight", "scale", "bias"]) load_tensors = test_tensors - regen_tensors kwargs = {tensor: getattr(test, tensor) for tensor in load_tensors} - return Ne16TestGenerator.from_conf(test.conf, **kwargs) + return NnxTestGenerator.from_conf(test.conf, **kwargs) -class Ne16TestHeaderGenerator: +class NnxTestHeaderGenerator: DEFAULT_HEADERS_DIR = "app/gen" - def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None): + def __init__( + self, + weight_unroll: Callable[ + [npt.NDArray[np.uint8], int, bool], npt.NDArray[np.uint8] + ], + headers_dir: Optional[Union[str, os.PathLike]] = None, + ): if headers_dir is None: - headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR + headers_dir = NnxTestHeaderGenerator.DEFAULT_HEADERS_DIR self.header_writer = HeaderWriter(headers_dir) + # function that takes the weights in CoutCinK format, bitwidth, and a depthwise flag, + # and returns a numpy array of dtype=np.uint8 of data in a layout correct for the accelerator + self.weight_unroll = weight_unroll - def generate(self, test_name: str, test: Ne16Test): + def generate(self, test_name: str, test: NnxTest): assert test.input is not None and test.output is not None _, in_channel, in_height, in_width = test.input.shape _, out_channel, out_height, out_width = test.output.shape # Render input in_ctype = test.conf.in_type.ctype() + in_signed = test.conf.in_type._signed in_data = test.input.permute(0, 2, 3, 1).ravel() self.header_writer.generate_vector_files( "input", _type=in_ctype, size=in_data.numel(), init=in_data @@ -431,10 +328,10 @@ def generate(self, test_name: str, test: Ne16Test): weight_offset = -(2 ** (weight_bits - 1)) weight_out_ch, weight_in_ch, weight_ks_h, weight_ks_w = test.weight.shape weight_data: np.ndarray = test.weight.numpy() - weight_offset - weight_init = Ne16.weight_unroll( + weight_init = self.weight_unroll( weight_data.astype(np.uint8), weight_type._bits, - depthwise=test.conf.depthwise, + test.conf.depthwise, ) self.header_writer.generate_vector_files( "weight", _type="uint8_t", size=weight_init.size, init=weight_init @@ -470,6 +367,7 @@ def generate(self, test_name: str, test: Ne16Test): "height": in_height, "width": in_width, "channel": in_channel, + "signed": in_signed, "bits": 8, }, "output": { diff --git a/test/conf.toml b/test/conf.toml index 1222f1d..c24055a 100644 --- a/test/conf.toml +++ b/test/conf.toml @@ -22,7 +22,7 @@ # Ne16TestClasses.py:Ne16TestConf().check_valid() # Input dimensions -in_height = 3 +in_height = 4 in_width = 3 in_channel = 8 diff --git a/test/conftest.py b/test/conftest.py index 6c2c15b..b871141 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -18,7 +18,14 @@ import os from typing import Union -from Ne16TestClasses import Ne16Test, Ne16TestGenerator +from Ne16 import Ne16 +from Ne16TestConf import Ne16TestConf +from Neureka import Neureka +from NeurekaTestConf import NeurekaTestConf +from NnxTestClasses import NnxTest, NnxTestGenerator + + +_SUPPORTED_ACCELERATORS = ["ne16", "neureka"] def pytest_addoption(parser): @@ -39,6 +46,13 @@ def pytest_addoption(parser): default=False, help="Recursively search for tests in given test directories.", ) + parser.addoption( + "-A", + "--accelerator", + 
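Taken together, the renamed classes compose per accelerator instead of hard-coding NE16: the conf class, the accumulator type, and the weight-unroll function are all injected. A sketch of the resulting flow for NE16 (the test directory name is hypothetical):

    from Ne16 import Ne16
    from Ne16TestConf import Ne16TestConf
    from NnxTestClasses import NnxTest, NnxTestGenerator, NnxTestHeaderGenerator

    test = NnxTest.load(Ne16TestConf, "tests/test_0")
    if not test.is_valid():
        test = NnxTestGenerator.from_conf(test.conf, Ne16.ACCUMULATOR_TYPE)
        test.save_data("tests/test_0")
    NnxTestHeaderGenerator(Ne16.weight_unroll).generate("tests/test_0", test)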
choices=_SUPPORTED_ACCELERATORS, + default="ne16", + help="Choose an accelerator to test. Default: ne16", + ) parser.addoption( "--regenerate", action="store_true", @@ -54,7 +68,7 @@ def pytest_addoption(parser): def _find_test_dirs(path: Union[str, os.PathLike]): - return [dirpath for dirpath, _, _ in os.walk(path) if Ne16Test.is_test_dir(dirpath)] + return [dirpath for dirpath, _, _ in os.walk(path) if NnxTest.is_test_dir(dirpath)] def pytest_generate_tests(metafunc): @@ -62,6 +76,7 @@ def pytest_generate_tests(metafunc): recursive = metafunc.config.getoption("recursive") regenerate = metafunc.config.getoption("regenerate") timeout = metafunc.config.getoption("timeout") + nnxName = metafunc.config.getoption("accelerator") if recursive: tests_dirs = test_dirs @@ -71,10 +86,24 @@ def pytest_generate_tests(metafunc): # (Re)Generate test data for test_dir in test_dirs: - test = Ne16Test.load(test_dir) + test = NnxTest.load(Ne16TestConf, test_dir) if not test.is_valid() or regenerate: - test = Ne16TestGenerator.from_conf(test.conf) + test = NnxTestGenerator.from_conf(test.conf, Ne16.ACCUMULATOR_TYPE) test.save_data(test_dir) + if nnxName == "ne16": + nnxCls = Ne16 + nnxTestConfCls = Ne16TestConf + elif nnxName == "neureka": + nnxCls = Neureka + nnxTestConfCls = NeurekaTestConf + else: + assert ( + False + ), f"Given accelerator {nnxName} not supported. Supported accelerators: {_SUPPORTED_ACCELERATORS}" + metafunc.parametrize("path", test_dirs) metafunc.parametrize("timeout", [timeout]) + metafunc.parametrize("nnxName", [nnxName]) + metafunc.parametrize("nnxCls", [nnxCls]) + metafunc.parametrize("nnxTestConfCls", [nnxTestConfCls]) diff --git a/test/test.py b/test/test.py index 39709b6..778c6ca 100644 --- a/test/test.py +++ b/test/test.py @@ -18,10 +18,12 @@ import os import re -from typing import Union, Optional, Tuple +from typing import Dict, Union, Optional, Tuple, Type import locale import subprocess -from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator +from Ne16 import Ne16 +from Neureka import Neureka +from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator from pathlib import Path HORIZONTAL_LINE = "\n" + "-" * 100 + "\n" @@ -49,17 +51,29 @@ def captured_output( def execute_command( - cmd: str, timeout: int = 30, cflags: Optional[str] = None + cmd: str, + timeout: int = 30, + cflags: Optional[str] = None, + envflags: Optional[Dict[str, str]] = None, ) -> Tuple[bool, str, str, Optional[str]]: - app_cflags = 'APP_CFLAGS="' + " ".join(cflags) + '" ' if cflags else "" - cmd = cmd + app_cflags + env = os.environ + if cflags: + env["APP_CFLAGS"] = '"' + " ".join(cflags) + '"' + if envflags: + for key, value in envflags.items(): + env[key] = value status = None stdout = None try: proc = subprocess.run( - cmd.split(), check=True, capture_output=True, text=True, timeout=timeout + cmd.split(), + check=True, + capture_output=True, + text=True, + timeout=timeout, + env=env, ) status = True msg = "OK" @@ -94,15 +108,23 @@ def assert_message( return retval -def test(path: str, timeout: int): +def test( + path: str, + timeout: int, + nnxName: str, + nnxCls: Union[Type[Ne16], Type[Neureka]], + nnxTestConfCls: Type[NnxTestConf], +): test_name = path - test = Ne16Test.load(path) + test = NnxTest.load(nnxTestConfCls, path) - Ne16TestHeaderGenerator().generate(test_name, test) + NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(test_name, test) Path("app/src/nnx_layer.c").touch() cmd = f"make -C app all run platform=gvsoc" - passed, msg, stdout, stderr = 
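With the environment handling above, per-test configuration travels to make through the environment rather than through extra command-line parsing, so the accelerator choice is a single variable. A sketch of a direct call (the timeout value is arbitrary):

    passed, msg, stdout, stderr = execute_command(
        "make -C app all run platform=gvsoc",
        timeout=120,
        envflags={"ACCELERATOR": "neureka"},
    )
    assert passed, msg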
execute_command(cmd=cmd, timeout=timeout) + passed, msg, stdout, stderr = execute_command( + cmd=cmd, timeout=timeout, envflags={"ACCELERATOR": nnxName} + ) assert passed, assert_message(msg, test_name, cmd, stdout, stderr) diff --git a/test/testgen.py b/test/testgen.py index e748f2e..d27c28e 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -20,24 +20,36 @@ import argparse import json import toml -from typing import Optional, Union, Set -from Ne16TestClasses import ( - Ne16TestConf, - Ne16TestGenerator, - Ne16Test, - Ne16TestHeaderGenerator, +from typing import Optional, Type, Union, Set +from Ne16 import Ne16 +from Ne16TestConf import Ne16TestConf +from Neureka import Neureka +from NeurekaTestConf import NeurekaTestConf +from NnxTestClasses import ( + NnxTest, + NnxTestConf, + NnxTestGenerator, + NnxTestHeaderGenerator, ) -def headers_gen(args, test: Optional[Ne16Test] = None): +def headers_gen( + args, + nnxCls: Union[Type[Ne16], Type[Neureka]], + nnxTestConfCls: Type[NnxTestConf], + test: Optional[NnxTest] = None, +): if test is None: - test = Ne16Test.load(args.test_dir) + test = NnxTest.load(nnxTestConfCls, args.test_dir) + assert test is not None if not test.is_valid(): - test = Ne16TestGenerator.from_conf(test.conf) - Ne16TestHeaderGenerator().generate(args.test_dir, test) + test = NnxTestGenerator.from_conf(test.conf, nnxCls.ACCUMULATOR_TYPE) + NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(args.test_dir, test) -def test_gen(args): +def test_gen( + args, nnxCls: Union[Type[Ne16], Type[Neureka]], nnxTestConfCls: Type[NnxTestConf] +): if args.conf.endswith(".toml"): test_conf_dict = toml.load(args.conf) elif args.conf.endswith(".json"): @@ -49,37 +61,67 @@ def test_gen(args): ) exit(-1) - test_conf = Ne16TestConf.model_validate(test_conf_dict) - test = Ne16TestGenerator.from_conf(test_conf) + test_conf = nnxTestConfCls.model_validate(test_conf_dict) + test = NnxTestGenerator.from_conf(test_conf, nnxCls.ACCUMULATOR_TYPE) if not args.skip_save: test.save(args.test_dir) if args.headers: - headers_gen(args, test) + headers_gen(args, nnxCls, nnxTestConfCls, test) -def _regen(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None: - test = Ne16Test.load(path) - test = Ne16TestGenerator.regenerate(test, regen_tensors) +def _regen( + path: Union[str, os.PathLike], + regen_tensors: Set[str], + nnxTestConfCls: Type[NnxTestConf], +) -> None: + test = NnxTest.load(nnxTestConfCls, path) + test = NnxTestGenerator.regenerate(test, regen_tensors) test.save(path) -def _regen_recursive(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None: - if Ne16Test.is_test_dir(path): - _regen(path, regen_tensors) +def _regen_recursive( + path: Union[str, os.PathLike], + regen_tensors: Set[str], + nnxTestConfCls: Type[NnxTestConf], +) -> None: + if NnxTest.is_test_dir(path): + _regen(path, regen_tensors, nnxTestConfCls) return for dirpath, _, _ in os.walk(path): - _regen_recursive(dirpath, regen_tensors) + _regen_recursive(dirpath, regen_tensors, nnxTestConfCls) -def test_regen(args): +def test_regen( + args, nnxCls: Union[Type[Ne16], Type[Neureka]], nnxTestConfCls: Type[NnxTestConf] +): + _ = nnxCls regen_tensors = set(args.tensors + ["output"]) for test_dir in args.test_dirs: if args.recurse: - _regen_recursive(test_dir, regen_tensors) + _regen_recursive(test_dir, regen_tensors, nnxTestConfCls) else: - _regen(test_dir, regen_tensors) + _regen(test_dir, regen_tensors, nnxTestConfCls) + + +def add_common_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "-t", + 
"--test-dir", + type=str, + dest="test_dir", + required=True, + help="Path to the test.", + ) + + parser.add_argument( + "-a", + "--accelerator", + choices=["ne16", "neureka"], + default="ne16", + help="Choose an accelerator. Default: ne16", + ) parser = argparse.ArgumentParser( @@ -91,14 +133,7 @@ def test_regen(args): parser_header = subparsers.add_parser( "headers", description="Generate headers for a single test." ) -parser_header.add_argument( - "-t", - "--test-dir", - type=str, - dest="test_dir", - required=True, - help="Path to the test." "basename.", -) +add_common_arguments(parser_header) parser_header.set_defaults(func=headers_gen) parser_test = subparsers.add_parser( @@ -112,14 +147,6 @@ def test_regen(args): required=True, help="Path to the configuration file.", ) -parser_test.add_argument( - "-t", - "--test-dir", - type=str, - dest="test_dir", - required=True, - help="Path to the test. " "basename.", -) parser_test.add_argument( "--headers", action="store_true", default=False, help="Generate headers." ) @@ -130,6 +157,7 @@ def test_regen(args): dest="skip_save", help="Skip saving the test.", ) +add_common_arguments(parser_test) parser_test.set_defaults(func=test_gen) parser_regen = subparsers.add_parser("regen", description="Regenerate test tensors.") @@ -138,25 +166,27 @@ def test_regen(args): type=str, nargs="?", default=[], - help="Tensors that should be regenerated. Output " "included by default.", -) -parser_regen.add_argument( - "-t", - "--test-dir", - action="append", - dest="test_dirs", - required=True, - help="Path to the test.", + help="Tensors that should be regenerated. Output included by default.", ) parser_regen.add_argument( "-r", "--recursive", action="store_true", default=False, - help="Recursively search for test directiories " "inside given test directories.", + help="Recursively search for test directiories inside given test directories.", ) +add_common_arguments(parser_regen) parser_regen.set_defaults(func=test_regen) args = parser.parse_args() -args.func(args) +if args.accelerator == "ne16": + nnxCls = Ne16 + nnxTestConfCls = Ne16TestConf +elif args.accelerator == "neureka": + nnxCls = Neureka + nnxTestConfCls = NeurekaTestConf +else: + assert False, f"Unsupported accelerator {args.accelerator}." + +args.func(args, nnxCls, nnxTestConfCls) From 27664cd44b1035df117646fba2f5ad6b52e19889 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 14:24:39 +0100 Subject: [PATCH 15/72] Remove stride2x2 for neureka --- inc/pulp_nnx_neureka.h | 16 ----------- neureka/hal/neureka_task.h | 8 ------ src/pulp_nnx_neureka.c | 55 -------------------------------------- test/NeurekaTestConf.py | 11 +------- test/app/src/nnx_layer.c | 14 +++++----- 5 files changed, 7 insertions(+), 97 deletions(-) diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h index b811f25..25ef4a8 100644 --- a/inc/pulp_nnx_neureka.h +++ b/inc/pulp_nnx_neureka.h @@ -59,19 +59,3 @@ int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task); * Block until you can resolve the task. */ void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task); - -/* Additional helper functions */ - -/** neureka_nnx_dispatch_stride2x2 - * - * It uses Neureka's 2x2 strided mode which reduces the number of writes Neureka - * does. This mode doesn't stride the Neureka's subtile input pointer, so we - * have to tile the tile to the subtile's spatial dimensions (in this case 3x3 - * output). Works only if the k_out is divisible by 2. 
- */ -void neureka_nnx_dispatch_stride2x2( - neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker); diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h index 70b80e5..7360b1e 100644 --- a/neureka/hal/neureka_task.h +++ b/neureka/hal/neureka_task.h @@ -163,13 +163,5 @@ void neureka_task_set_dims( const uint32_t w_out_stride, const uint32_t k_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); -void neureka_task_set_dims_stride2x2( - neureka_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, - const uint8_t padding_bottom, const uint8_t padding_right, - const uint8_t padding_left); #endif // !__NEUREKA_TASK_H__ diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c index c7e2c64..0abb845 100644 --- a/src/pulp_nnx_neureka.c +++ b/src/pulp_nnx_neureka.c @@ -74,58 +74,3 @@ void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) { neureka_siracusa_event_wait_and_clear(); } } - -static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, - uint32_t size_j, uint32_t size_k, - uint32_t stride_j, uint32_t stride_k, - uint32_t overlap_i, uint32_t overlap_j, - uint32_t offset_i, uint32_t offset_j, - uint8_t data_size) { - return ptr + - (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k * - data_size / 8 + - (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; -} - -void neureka_nnx_dispatch_stride2x2( - neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker) { - const uint8_t stride = 2; - const uint8_t bits = 8; - - const uint32_t n_h = divnceil(h_out, stride); - const uint32_t n_w = divnceil(w_out, stride); - const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0; - const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0; - const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0; - const uint32_t output_width_offset = w_out % stride == 1 ? 1 : 0; - - const uint32_t input_base = task->data.infeat_ptr; - const uint32_t output_base = task->data.outfeat_ptr; - const uint32_t tile_padding = task->data.cfg.padding; - - for (int i = 0; i < n_h; i++) { - for (int j = 0; j < n_w; j++) { - task->data.infeat_ptr = - _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, - w_in_stride, k_in_stride, h_ker - stride, - w_ker - stride, i == 0 ? 0 : input_height_offset, - j == 0 ? 0 : input_width_offset, bits); - task->data.outfeat_ptr = - _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride, - k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset, - j == 0 ? 
0 : output_width_offset, bits); - - task->data.cfg.padding = - neureka_get_tile_padding(tile_padding, i, j, n_h, n_w); - - // Altered dispatch to wait if cannot acquire - while (neureka_nnx_dispatch(dev, task)) { - neureka_siracusa_event_wait_and_clear(); - } - } - } -} diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index dad7fc4..8d677aa 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -36,9 +36,7 @@ def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: @field_validator("stride") @classmethod def check_valid_stride(cls, v: Stride) -> Stride: - assert v == Stride(height=1, width=1) or v == Stride( - height=2, width=2 - ), f"Unsupported stride {v}. Supported 1x1 and 2x2." + assert v == Stride(height=1, width=1), f"Unsupported stride {v}. Supported 1x1." return v @staticmethod @@ -81,13 +79,6 @@ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType NeurekaTestConf._check_type("bias_type", v, ["int32"]) return v - @model_validator(mode="after") # type: ignore - def check_valid_out_channel_with_stride_2x2(self) -> NeurekaTestConf: - assert implies( - self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 - ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" - return self - @model_validator(mode="after") # type: ignore def check_valid_depthwise(self) -> NeurekaTestConf: assert implies( diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 15ff359..2ca522f 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -76,7 +76,6 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #define nnx_task_init neureka_task_init #define nnx_task_set_dims neureka_task_set_dims -#define nnx_task_set_dims_stride2x2 neureka_task_set_dims_stride2x2 #define nnx_task_set_ptrs neureka_task_set_ptrs #define NNX_GVSOC_LOG_LEVEL NEUREKA_GVSOC_LOG_LEVEL_ALL @@ -88,7 +87,6 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #define nnx_init neureka_nnx_init #define nnx_dispatch_wait neureka_nnx_dispatch_wait -#define nnx_dispatch_stride2x2 neureka_nnx_dispatch_stride2x2 #define nnx_dispatch neureka_nnx_dispatch #define nnx_resolve_wait neureka_nnx_resolve_wait #define nnx_term neureka_nnx_term @@ -117,18 +115,18 @@ static void task_prepare(nnx_task_t *task) { .flag_shift = nnxTaskFlagFalse}, STRIDE_HEIGHT); - if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 nnx_task_set_dims_stride2x2( task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } else { +#else nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } +#endif nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, @@ -151,14 +149,14 @@ static void task_execute(nnx_task_t *task) { nnx_dispatch_wait(dev); - if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH); - } else { +#else nnx_dispatch(dev, task); - } +#endif 
nnx_resolve_wait(dev, task); From ccc835b9fb5eb10c0566b1070a12ab076d9b1dfc Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 14:29:23 +0100 Subject: [PATCH 16/72] Fix formatting --- neureka/hal/neureka_task.c | 44 ++++++++++++++++++--------------- neureka/hal/neureka_task_defs.h | 5 ++-- test/app/src/main.c | 12 ++++----- test/app/src/nnx_layer.c | 28 ++++++++++----------- 4 files changed, 47 insertions(+), 42 deletions(-) diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 941fcde..2097af7 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -48,19 +48,19 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, const uint32_t weights_offset_factor, neureka_quant_t quant, neureka_norm_t norm, const uint8_t stride) { - *task = (neureka_task_t){ - .outbytes = output_bits / 8, - .qw = weights_bits, - .stride_shift = stride == 2 ? 1 : 0, - .output_channel_throughput = depthwise - ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, - .input_channel_throughput = kernel_shape == 3 - ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1, - .kernel_shape = kernel_shape, - .depthwise = depthwise, - .data = {0}}; + *task = (neureka_task_t){.outbytes = output_bits / 8, + .qw = weights_bits, + .stride_shift = stride == 2 ? 1 : 0, + .output_channel_throughput = + depthwise ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 + : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, + .input_channel_throughput = + kernel_shape == 3 + ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 + : NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1, + .kernel_shape = kernel_shape, + .depthwise = depthwise, + .data = {0}}; const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 : depthwise == 1 ? NEUREKA_FLAG_MODE_3x3_DW @@ -122,15 +122,15 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, const neureka_stride_t output_stride = { .d0 = 32, // TODO: should depend on outbytes. Probably 32 / outbytes .d1 = k_out_stride * task->outbytes, - .d2 = k_out_stride * task->outbytes * w_out_stride - }; + .d2 = k_out_stride * task->outbytes * w_out_stride}; task->data.cfg.output_stride = output_stride; task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_D0_STRIDE; if (task->kernel_shape == 1) { // 1x1 task->data.cfg.weights_stride.d1 = NEUREKA_WEIGHT_D0_STRIDE * num_k_in; } else if (!task->depthwise) { // 3x3 - task->data.cfg.weights_stride.d1 = NEUREKA_WEIGHT_D0_STRIDE * task->qw * num_k_in; + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_D0_STRIDE * task->qw * num_k_in; } else { // 3x3 depthwise task->data.cfg.weights_stride.d1 = 0; } @@ -151,10 +151,14 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput); const uint16_t rem_Ho = remainder(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT); const uint16_t rem_Wo = remainder(w_out, NEUREKA_COMPUTE_SIZE_WIDTH); - const uint16_t rem_Hi = rem_Ho == 0 ? 0 : (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - - padding_bottom; // TODO: Check padding bottom - const uint16_t rem_Wi = rem_Wo == 0 ? 0 : (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - - padding_right; // TODO: Check padding right + const uint16_t rem_Hi = + rem_Ho == 0 ? 0 + : (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - + padding_bottom; // TODO: Check padding bottom + const uint16_t rem_Wi = + rem_Wo == 0 ? 0 + : (task->kernel_shape == 1 ? 
rem_Wo : rem_Wo + 2) - + padding_right; // TODO: Check padding right const neureka_subtile_t subtile = { .number = {.KoKi = concat_half(num_Ko, num_Ki), diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index f720061..952d7b2 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -78,7 +78,8 @@ #define NEUREKA_QUANT_MODE_32BIT (2 << 21) // conf0[20:16] - quantization shift amount #define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc -#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE \ + (1 << 15) // Unimplemented in gvsoc #define NEUREKA_FLAG_STREAMIN (1 << 14) #define NEUREKA_NORM_MODE_8BIT (0 << 12) #define NEUREKA_NORM_MODE_16BIT (1 << 12) // not supported @@ -87,7 +88,7 @@ #define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) #define NEUREKA_FLAG_USE_WMEM (1 << 9) #define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // not supported +#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // not supported #define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not supported #define NEUREKA_FLAG_MODE_3x3 (0 << 5) #define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) diff --git a/test/app/src/main.c b/test/app/src/main.c index db32e3f..ecc9dd4 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -20,18 +20,18 @@ #include +#include "bias.h" +#include "input.h" #include "layer_util.h" #include "nnx_layer.h" #include "output.h" -#include "input.h" -#include "bias.h" #include "scale.h" #include "weight.h" -#define NNX_MEMCPY(dst, src, size) \ -for (int i = 0; i < size; i++) { \ - dst[i] = src[i]; \ -} +#define NNX_MEMCPY(dst, src, size) \ + for (int i = 0; i < size; i++) { \ + dst[i] = src[i]; \ + } int main() { struct pi_device cl_dev; diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 2ca522f..eb6f0ff 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -116,16 +116,16 @@ static void task_prepare(nnx_task_t *task) { STRIDE_HEIGHT); #if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 - nnx_task_set_dims_stride2x2( - task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, - OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); + nnx_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, + OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, + PADDING_RIGHT, PADDING_LEFT); #else - nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, + OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, PADDING_BOTTOM, + PADDING_RIGHT, PADDING_LEFT); #endif nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, @@ -150,12 +150,12 @@ static void task_execute(nnx_task_t *task) { nnx_dispatch_wait(dev); #if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 - nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, - WEIGHT_HEIGHT, WEIGHT_WIDTH); + 
nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, + OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, + WEIGHT_HEIGHT, WEIGHT_WIDTH); #else - nnx_dispatch(dev, task); + nnx_dispatch(dev, task); #endif nnx_resolve_wait(dev, task); From df0eb6bf70a6313de3f9a23007c62bd6be8bf8f2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 14:54:15 +0100 Subject: [PATCH 17/72] Skip invalid tests --- test/conftest.py | 39 ++++++++++++++++++++++++--------------- test/test.py | 15 ++++++--------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index b871141..f5d15db 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -18,6 +18,8 @@ import os from typing import Union + +import pydantic from Ne16 import Ne16 from Ne16TestConf import Ne16TestConf from Neureka import Neureka @@ -78,19 +80,6 @@ def pytest_generate_tests(metafunc): timeout = metafunc.config.getoption("timeout") nnxName = metafunc.config.getoption("accelerator") - if recursive: - tests_dirs = test_dirs - test_dirs = [] - for tests_dir in tests_dirs: - test_dirs.extend(_find_test_dirs(tests_dir)) - - # (Re)Generate test data - for test_dir in test_dirs: - test = NnxTest.load(Ne16TestConf, test_dir) - if not test.is_valid() or regenerate: - test = NnxTestGenerator.from_conf(test.conf, Ne16.ACCUMULATOR_TYPE) - test.save_data(test_dir) - if nnxName == "ne16": nnxCls = Ne16 nnxTestConfCls = Ne16TestConf @@ -102,8 +91,28 @@ def pytest_generate_tests(metafunc): False ), f"Given accelerator {nnxName} not supported. Supported accelerators: {_SUPPORTED_ACCELERATORS}" - metafunc.parametrize("path", test_dirs) + if recursive: + tests_dirs = test_dirs + test_dirs = [] + for tests_dir in tests_dirs: + test_dirs.extend(_find_test_dirs(tests_dir)) + + # Load valid tests + valid_paths = [] + valid_tests = [] + for test_dir in test_dirs: + try: + test = NnxTest.load(nnxTestConfCls, test_dir) + # (Re)generate data + if not test.is_valid() or regenerate: + test = NnxTestGenerator.from_conf(test.conf, nnxCls.ACCUMULATOR_TYPE) + test.save_data(test_dir) + valid_tests.append(test) + valid_paths.append(test_dir) + except pydantic.ValidationError as e: + _ = e + + metafunc.parametrize("nnxTestAndName", zip(valid_tests, valid_paths)) metafunc.parametrize("timeout", [timeout]) metafunc.parametrize("nnxName", [nnxName]) metafunc.parametrize("nnxCls", [nnxCls]) - metafunc.parametrize("nnxTestConfCls", [nnxTestConfCls]) diff --git a/test/test.py b/test/test.py index 778c6ca..542a937 100644 --- a/test/test.py +++ b/test/test.py @@ -109,16 +109,13 @@ def assert_message( def test( - path: str, + nnxTestAndName: Tuple[NnxTest, str], timeout: int, nnxName: str, nnxCls: Union[Type[Ne16], Type[Neureka]], - nnxTestConfCls: Type[NnxTestConf], ): - test_name = path - test = NnxTest.load(nnxTestConfCls, path) - - NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(test_name, test) + nnxTest, nnxTestName = nnxTestAndName + NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(nnxTestName, nnxTest) Path("app/src/nnx_layer.c").touch() cmd = f"make -C app all run platform=gvsoc" @@ -126,18 +123,18 @@ def test( cmd=cmd, timeout=timeout, envflags={"ACCELERATOR": nnxName} ) - assert passed, assert_message(msg, test_name, cmd, stdout, stderr) + assert passed, assert_message(msg, nnxTestName, cmd, stdout, stderr) match_success = re.search(r"> Success! No errors found.", stdout) match_fail = re.search(r"> Failure! 
Found (\d*)/(\d*) errors.", stdout)
 
     assert match_success or match_fail, assert_message(
-        "No regexes matched.", test_name, cmd, stdout
+        "No regexes matched.", nnxTestName, cmd, stdout
     )
 
     assert not match_fail, assert_message(
         f"Errors found: {match_fail.group(1)}/{match_fail.group(2)}",
-        test_name,
+        nnxTestName,
         cmd,
         stdout,
     )
 
From 31dc36e93abf1fb656d1fcbf2b9c348ab847240b Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Thu, 18 Jan 2024 15:17:34 +0100
Subject: [PATCH 18/72] Change invalid test skip to explicit pytest skip with
 reason

---
 test/conftest.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index f5d15db..26a2866 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -19,6 +19,7 @@
 import os
 from typing import Union
 
+import pytest
 import pydantic
 from Ne16 import Ne16
 from Ne16TestConf import Ne16TestConf
@@ -98,8 +99,7 @@ def pytest_generate_tests(metafunc):
             test_dirs.extend(_find_test_dirs(tests_dir))
 
     # Load valid tests
-    valid_paths = []
-    valid_tests = []
+    nnxTestAndNames = []
     for test_dir in test_dirs:
         try:
             test = NnxTest.load(nnxTestConfCls, test_dir)
@@ -107,12 +107,12 @@ def pytest_generate_tests(metafunc):
             if not test.is_valid() or regenerate:
                 test = NnxTestGenerator.from_conf(test.conf, nnxCls.ACCUMULATOR_TYPE)
                 test.save_data(test_dir)
-            valid_tests.append(test)
-            valid_paths.append(test_dir)
+            nnxTestAndNames.append((test, test_dir))
         except pydantic.ValidationError as e:
             _ = e
+            nnxTestAndNames.append(pytest.param((None, test_dir), marks=pytest.mark.skipif(True, reason=f"Invalid test {test_dir}: {e.errors}")))
 
-    metafunc.parametrize("nnxTestAndName", zip(valid_tests, valid_paths))
+    metafunc.parametrize("nnxTestAndName", nnxTestAndNames)
     metafunc.parametrize("timeout", [timeout])
     metafunc.parametrize("nnxName", [nnxName])
     metafunc.parametrize("nnxCls", [nnxCls])

From 7bae33cb70579633136aa08aa3f6e47ba9fed2fe Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Thu, 18 Jan 2024 15:29:08 +0100
Subject: [PATCH 19/72] Add neureka test to the CI

---
 .gitlab-ci.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b595682..b8357a4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -34,11 +34,20 @@ static_check_python:
   script:
     - pyright .
-run_test0: +run_ne16_test: stage: test tags: - gap9-sdk artifacts: untracked: true script: - - cd test && pytest test.py --test-dir tests --recursive + - cd test && pytest test.py --test-dir tests --recursive -A ne16 + +run_neureka_test: + stage: test + tags: + - siracusa-sdk + artifacts: + untracked: true + script: + - cd test && pytest test.py --test-dir tests --recursive -A neureka From a0080c6753fbf76ea945ea046a605d082814c7f2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 15:29:55 +0100 Subject: [PATCH 20/72] Fix formatting --- test/conftest.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 26a2866..8f745e7 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -110,7 +110,14 @@ def pytest_generate_tests(metafunc): nnxTestAndNames.append((test, test_dir)) except pydantic.ValidationError as e: _ = e - nnxTestAndNames.append(pytest.param((None, test_dir), marks=pytest.mark.skipif(True, reason=f"Invalid test {test_dir}: {e.errors}"))) + nnxTestAndNames.append( + pytest.param( + (None, test_dir), + marks=pytest.mark.skipif( + True, reason=f"Invalid test {test_dir}: {e.errors}" + ), + ) + ) metafunc.parametrize("nnxTestAndName", nnxTestAndNames) metafunc.parametrize("timeout", [timeout]) From be50522461137e0aa80fcfff898a07637198be41 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 10:01:53 +0100 Subject: [PATCH 21/72] Add --print-tensors flag --- test/NnxTestClasses.py | 5 +++++ test/testgen.py | 26 +++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index ed1b55e..57c4f23 100644 --- a/test/NnxTestClasses.py +++ b/test/NnxTestClasses.py @@ -179,6 +179,7 @@ def from_conf( scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, global_shift: Optional[torch.Tensor] = None, + verbose: bool = False, ) -> NnxTest: torch.manual_seed(NnxTestGenerator._DEFAULT_SEED) @@ -224,6 +225,10 @@ def from_conf( torch.int32 ) + if verbose: + print("INTERMEDIATE RESULTS (pre-normalization/requant):") + print(output) + if conf.has_norm_quant: if scale is None: assert conf.scale_type is not None diff --git a/test/testgen.py b/test/testgen.py index d27c28e..ad4f315 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -47,6 +47,21 @@ def headers_gen( NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(args.test_dir, test) +def print_tensors(test: NnxTest): + print("INPUT TENSOR:") + print(test.input) + print("WEIGHT TENSOR:") + print(test.weight) + print("SCALE TENSOR:") + print(test.scale) + print("BIAS TENSOR:") + print(test.bias) + print("GLOBAL SHIFT TENSOR:") + print(test.global_shift) + print("EXPECTED OUTPUT TENSOR:") + print(test.output) + + def test_gen( args, nnxCls: Union[Type[Ne16], Type[Neureka]], nnxTestConfCls: Type[NnxTestConf] ): @@ -62,11 +77,13 @@ def test_gen( exit(-1) test_conf = nnxTestConfCls.model_validate(test_conf_dict) - test = NnxTestGenerator.from_conf(test_conf, nnxCls.ACCUMULATOR_TYPE) + test = NnxTestGenerator.from_conf(test_conf, nnxCls.ACCUMULATOR_TYPE, verbose=args.print_tensors) if not args.skip_save: test.save(args.test_dir) if args.headers: headers_gen(args, nnxCls, nnxTestConfCls, test) + if args.print_tensors: + print_tensors(test) def _regen( @@ -157,6 +174,13 @@ def add_common_arguments(parser: argparse.ArgumentParser): dest="skip_save", help="Skip saving the test.", ) +parser_test.add_argument( + "--print-tensors", + action="store_true", + default=False, + 
dest="print_tensors", + help="Print tensor values to stdout.", +) add_common_arguments(parser_test) parser_test.set_defaults(func=test_gen) From 2c07dbd6331e7ea0baa8164f2be8efcea7b39b56 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 12:43:31 +0100 Subject: [PATCH 22/72] Remove memcpys since it was a linker script bug --- test/app/src/main.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/test/app/src/main.c b/test/app/src/main.c index ecc9dd4..7af084c 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -28,27 +28,18 @@ #include "scale.h" #include "weight.h" -#define NNX_MEMCPY(dst, src, size) \ - for (int i = 0; i < size; i++) { \ - dst[i] = src[i]; \ - } - int main() { struct pi_device cl_dev; struct pi_cluster_conf cl_conf; struct pi_cluster_task cl_task; - printf("\n"); - printf("Test %s starting\n", TEST_NAME); + printf("\nTest %s starting\n", TEST_NAME); + + printf("\nAccelerator: %s\n", NNX_NEUREKA ? "neureka" : "ne16"); printf("\n"); layer_info(); - NNX_MEMCPY(input, input_l2, INPUT_SIZE); - NNX_MEMCPY(bias, bias_l2, BIAS_SIZE); - NNX_MEMCPY(scale, scale_l2, SCALE_SIZE); - NNX_MEMCPY(weight, weight_l2, WEIGHT_SIZE); - pi_cluster_conf_init(&cl_conf); pi_open_from_conf(&cl_dev, &cl_conf); if (pi_cluster_open(&cl_dev)) { @@ -59,11 +50,9 @@ int main() { &cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL)); pi_cluster_close(&cl_dev); - printf("\n"); - printf("Test %s finished\n", TEST_NAME); + printf("\nTest %s finished\n", TEST_NAME); printf("\n"); - NNX_MEMCPY(output_l2, output, OUTPUT_SIZE); check_output(); return 0; From 6a82ff57e4b6ae433c807ee8ed9a0f84fa5cc984 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 13:05:53 +0100 Subject: [PATCH 23/72] Add Application section to test's readme --- test/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/README.md b/test/README.md index c3d29c5..8442493 100644 --- a/test/README.md +++ b/test/README.md @@ -35,3 +35,9 @@ $ pytest test.py --help - [testgen.py](testgen.py): collection of helper tools for individual tests For more information you can run the script with the `-h` flag. + +## Application + +The Makefile in the `app/` uses a flag `ACCELERATOR` to decide which accelerator to use. +The choices are _ne16_ or _neureka_. +You can either export it or run it like `ACCELERATOR= make clean all run`. 
From 295ce909171a53623cb373c815b9e41c13a047bf Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 13:06:22 +0100 Subject: [PATCH 24/72] Fix formatting --- test/testgen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/testgen.py b/test/testgen.py index ad4f315..43b4160 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -77,7 +77,9 @@ def test_gen( exit(-1) test_conf = nnxTestConfCls.model_validate(test_conf_dict) - test = NnxTestGenerator.from_conf(test_conf, nnxCls.ACCUMULATOR_TYPE, verbose=args.print_tensors) + test = NnxTestGenerator.from_conf( + test_conf, nnxCls.ACCUMULATOR_TYPE, verbose=args.print_tensors + ) if not args.skip_save: test.save(args.test_dir) if args.headers: From dc2409ca6e3e1a4ba2258194bf2fa7863e43e763 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 13:11:43 +0100 Subject: [PATCH 25/72] Fix accelerator name printing --- test/app/Makefile | 2 +- test/app/src/main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/app/Makefile b/test/app/Makefile index 493c092..75fc343 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -61,7 +61,7 @@ APP_SRCS += $(wildcard gen/src/*.c) # Flags ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:]) -APP_CFLAGS += -DNNX_ACCELERATOR=$(ACCELERATOR) -DNNX_$(ACCELERATOR_UPPERCASE) +APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE) APP_CFLAGS += -O2 -w -Wall -Werror -flto APP_LDFLAGS += -flto diff --git a/test/app/src/main.c b/test/app/src/main.c index 7af084c..a2e7fd5 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -35,7 +35,7 @@ int main() { printf("\nTest %s starting\n", TEST_NAME); - printf("\nAccelerator: %s\n", NNX_NEUREKA ? "neureka" : "ne16"); + printf("\nAccelerator: %s\n", NNX_ACCELERATOR); printf("\n"); layer_info(); From d6ba62068f95ff62191b7ed9d8b2265701ad5985 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 13:46:31 +0100 Subject: [PATCH 26/72] Add input_signed to neureka --- neureka/hal/neureka_task.c | 4 ++-- neureka/hal/neureka_task.h | 4 +--- neureka/hal/neureka_task_defs.h | 3 ++- test/NeurekaTestConf.py | 2 +- test/app/src/nnx_layer.c | 7 ++++++- test/tests/test_102/conf.json | 29 +++++++++++++++++++++++++++++ test/tests/test_103/conf.json | 29 +++++++++++++++++++++++++++++ test/tests/test_104/conf.json | 29 +++++++++++++++++++++++++++++ test/tests/test_105/conf.json | 29 +++++++++++++++++++++++++++++ 9 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 test/tests/test_102/conf.json create mode 100644 test/tests/test_103/conf.json create mode 100644 test/tests/test_104/conf.json create mode 100644 test/tests/test_105/conf.json diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 2097af7..1b210ec 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -47,10 +47,9 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, const neureka_weight_offset_mode_e weights_offset_mode, const uint32_t weights_offset_factor, neureka_quant_t quant, neureka_norm_t norm, - const uint8_t stride) { + const uint8_t flag_input_signed) { *task = (neureka_task_t){.outbytes = output_bits / 8, .qw = weights_bits, - .stride_shift = stride == 2 ? 1 : 0, .output_channel_throughput = depthwise ? 
NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, @@ -67,6 +66,7 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, : NEUREKA_FLAG_MODE_3x3; task->data.cfg.conf0 |= + flag_input_signed << NEUREKA_SHIFT_FLAG_INPUT_SIGNED | NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | (quant.shift_amount << 16) | quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | norm.mode | diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h index 7360b1e..64356e6 100644 --- a/neureka/hal/neureka_task.h +++ b/neureka/hal/neureka_task.h @@ -111,9 +111,7 @@ typedef struct neureka_task_data_t { typedef struct neureka_task_t { neureka_task_data_t data; uint8_t outbytes; - uint8_t weight_d0_stride; uint8_t qw; - uint8_t stride_shift; uint8_t output_channel_throughput; uint8_t input_channel_throughput; uint8_t kernel_shape; @@ -127,7 +125,7 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, const neureka_weight_offset_mode_e weights_offset_mode, const uint32_t weights_offset_factor, neureka_quant_t quant, neureka_norm_t norm, - const uint8_t stride); + const uint8_t flag_input_signed); uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index 952d7b2..7ed77eb 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -61,6 +61,7 @@ /* SHIFT */ +#define NEUREKA_SHIFT_FLAG_INPUT_SIGNED (26) #define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) #define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) #define NEUREKA_SHIFT_QUANT_SHIFT (16) @@ -68,7 +69,7 @@ /* CONF0 FLAGS */ -#define NEUREKA_FLAG_SIGNED_ACTIVATION (1 << 26) +#define NEUREKA_FLAG_INPUT_SIGNED (1 << 26) #define NEUREKA_FLAG_NORM_BIAS (1 << 25) #define NEUREKA_FLAG_NORM_SHIFT (1 << 24) #define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index 8d677aa..8fe8e68 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -50,7 +50,7 @@ def _check_type( @field_validator("in_type") @classmethod def check_valid_in_type(cls, v: IntegerType) -> IntegerType: - NeurekaTestConf._check_type("in_type", v, ["uint8"]) + NeurekaTestConf._check_type("in_type", v, ["uint8", "int8"]) return v @field_validator("out_type") diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index eb6f0ff..003e55e 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -113,7 +113,12 @@ static void task_prepare(nnx_task_t *task) { (nnx_norm_t){.mode = normMode8Bit, .flag_bias = HAS_BIAS ? 
nnxTaskFlagTrue : nnxTaskFlagFalse, .flag_shift = nnxTaskFlagFalse}, - STRIDE_HEIGHT); +#ifdef NNX_NE16 + STRIDE_HEIGHT +#elif NNX_NEUREKA + INPUT_SIGNED +#endif + ); #if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 nnx_task_set_dims_stride2x2( diff --git a/test/tests/test_102/conf.json b/test/tests/test_102/conf.json new file mode 100644 index 0000000..d6d0c17 --- /dev/null +++ b/test/tests/test_102/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 4, + "in_width": 3, + "in_channel": 8, + "out_channel": 8, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_103/conf.json b/test/tests/test_103/conf.json new file mode 100644 index 0000000..3eff547 --- /dev/null +++ b/test/tests/test_103/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 25, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_104/conf.json b/test/tests/test_104/conf.json new file mode 100644 index 0000000..d6d00e4 --- /dev/null +++ b/test/tests/test_104/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 25, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_105/conf.json b/test/tests/test_105/conf.json new file mode 100644 index 0000000..0f34422 --- /dev/null +++ b/test/tests/test_105/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 40, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file From 0934172054a78e4482005fe75319628c7c337363 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 18:17:25 +0100 Subject: [PATCH 27/72] Add readme per accelerator --- README.md | 42 ++---------------------------------------- ne16/README.md | 37 +++++++++++++++++++++++++++++++++++++ neureka/README.md | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 40 deletions(-) create mode 100644 ne16/README.md create mode 100644 neureka/README.md diff --git a/README.md b/README.md index ea4c6a8..0daa8fd 100644 --- a/README.md +++ b/README.md @@ -39,46 +39,8 @@ _Note: The accelerator 
can provide additional helper functions if needed._ ## Accelerators -### NE16 - -Github repo [link](https://github.com/pulp-platform/ne16). - -#### Implemented features - -- [x] Convolution w/ kernel shape 1x1 -- [x] Convolution w/ kernel shape 3x3 -- [x] Depthwise convolution w/ kernel shape 3x3 -- [x] Stride 1x1 -- [x] Stride 2x2 -- [ ] Normalization and quantization - - [x] With - - [ ] Without - - [x] Relu (w/ and w/o) - - [x] Bias (w/ and w/o) - - [ ] Per-channel shift - - [x] Per-layer shift - - [ ] Rounding -- [ ] Input type - - [x] uint8 - - [ ] uint16 -- [ ] Output type - - [x] int8 - - [x] uint8 (only w/ Relu) - - [ ] int32 - - [ ] uint32 (only w/ Relu) -- [ ] Scale type - - [x] uint8 - - [ ] uint16 - - [ ] uint32 -- [x] Bias type - - [x] int32 -- [ ] Weight type - - [x] int8 - - [ ] int2-7 - -### Neureka - -**Untested and considered broken.** +- [NE16](ne16/README.md) +- [Neureka](neureka/README.md) ## Testing diff --git a/ne16/README.md b/ne16/README.md new file mode 100644 index 0000000..2876b0a --- /dev/null +++ b/ne16/README.md @@ -0,0 +1,37 @@ +# NE16 + +## Docs + +- Github repo [link](https://github.com/pulp-platform/ne16). + +## Implemented features + +- [x] Convolution w/ kernel shape 1x1 +- [x] Convolution w/ kernel shape 3x3 +- [x] Depthwise convolution w/ kernel shape 3x3 +- [x] Stride 2x2 +- [ ] Normalization and quantization + - [x] With + - [ ] Without + - [x] Relu (w/ and w/o) + - [x] Bias (w/ and w/o) + - [ ] Per-channel shift + - [x] Per-layer shift + - [ ] Rounding +- [ ] Input type + - [x] uint8 + - [ ] uint16 +- [ ] Output type + - [x] int8 + - [x] uint8 (only w/ Relu) + - [ ] int32 + - [ ] uint32 (only w/ Relu) +- [ ] Scale type + - [x] uint8 + - [ ] uint16 + - [ ] uint32 +- [x] Bias type + - [x] int32 +- [ ] Weight type + - [x] int8 + - [ ] int2-7 diff --git a/neureka/README.md b/neureka/README.md new file mode 100644 index 0000000..a5043f0 --- /dev/null +++ b/neureka/README.md @@ -0,0 +1,35 @@ +# NE16 + +## Docs + +Github repo [link](https://github.com/siracusa-soc/ne). + +## Implemented features + +- [x] Convolution w/ kernel shape 1x1 +- [x] Convolution w/ kernel shape 3x3 +- [x] Depthwise convolution w/ kernel shape 3x3 +- [ ] Normalization and quantization + - [x] With + - [ ] Without + - [x] Relu (w/ and w/o) + - [x] Bias (w/ and w/o) + - [ ] Per-channel shift + - [x] Per-layer shift + - [ ] Rounding +- [ ] Input type + - [x] uint8 + - [x] int8 +- [ ] Output type + - [x] int8 + - [x] uint8 (only w/ Relu) + - [ ] int32 + - [ ] uint32 (only w/ Relu) +- [ ] Scale type + - [x] uint8 + - [ ] uint32 +- [x] Bias type + - [x] int32 +- [ ] Weight type + - [x] int8 + - [ ] int2-7 From 2947229d55900d557118efda991879bcb09fa275 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 18:56:31 +0100 Subject: [PATCH 28/72] Remove ne16 input dim 2 stride calculation --- ne16/hal/ne16_task.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index b0a4337..7e405f9 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -114,11 +114,7 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); const ne16_stride_t input_stride = { - .d0 = k_in_stride, - .d1 = k_in_stride * w_in_stride, - .d2 = task->depthwise ? 
0 - : k_in_stride * NE16_FILTER_BUFFER_SIZE * - NE16_FILTER_BUFFER_SIZE}; + .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; // WARNING: Stride works only for even output channel sizes (divisible by 2) @@ -133,20 +129,18 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw; task->data.cfg.weights_stride.d1 = task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; } else if (!task->depthwise) { task->data.cfg.weights_stride.d0 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride; task->data.cfg.weights_stride.d1 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; } else { task->data.cfg.weights_stride.d0 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride; task->data.cfg.weights_stride.d1 = 0; - task->data.cfg.weights_stride.d2 = 0; } + task->data.cfg.weights_stride.d2 = 0; } void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, From b7442eed5e660f21e4f1ae3379f96d924afe8b42 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 19 Jan 2024 19:12:36 +0100 Subject: [PATCH 29/72] Move common validators to NnxTest --- test/Ne16TestConf.py | 40 +++------------------------------------ test/NeurekaTestConf.py | 40 +++------------------------------------ test/NnxTestClasses.py | 42 +++++++++++++++++++++++++++++++++++++++-- test/TestClasses.py | 4 ++++ 4 files changed, 50 insertions(+), 76 deletions(-) diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py index 889a1fe..7e4bd01 100644 --- a/test/Ne16TestConf.py +++ b/test/Ne16TestConf.py @@ -20,7 +20,7 @@ from typing import List, Union, Optional from Ne16 import Ne16 from NnxTestClasses import NnxTestConf -from TestClasses import implies, KernelShape, Padding, Stride, IntegerType +from TestClasses import implies, KernelShape, Stride, IntegerType from pydantic import field_validator, model_validator @@ -89,52 +89,18 @@ def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf: return self @model_validator(mode="after") # type: ignore - def check_valid_depthwise(self) -> Ne16TestConf: + def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf: assert implies( self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." - assert implies(self.depthwise, self.in_channel == self.out_channel), ( - f"Input and output channel should be the same in a depthwise layer. " - f"input channel: {self.in_channel}, output channel: {self.out_channel}" - ) - return self - - @model_validator(mode="after") # type: ignore - def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf: - assert implies( - self.kernel_shape == KernelShape(height=1, width=1), - self.padding == Padding(top=0, bottom=0, left=0, right=0), - ), f"No padding on 1x1 kernel. Given padding {self.padding}" - return self - - @field_validator("has_norm_quant") - @classmethod - def check_valid_has_norm_quant(cls, v: bool) -> bool: - assert v == True, f"Untested without has_norm_quant." - return v - - @model_validator(mode="after") # type: ignore - def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: - if self.has_norm_quant: - assert self.scale_type is not None, "Scale type was not provided." - if self.has_bias: - assert self.bias_type is not None, "Bias type was not provided." 
return self @model_validator(mode="after") # type: ignore - def check_valid_out_type_with_flags(self) -> Ne16TestConf: + def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf: assert implies( not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE ), ( f"Without quantization, the output type has to be equal to the " f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}" ) - assert implies( - self.has_norm_quant, - (self.has_relu and not self.out_type._signed) - or (not self.has_relu and self.out_type._signed), - ), ( - f"Output type has to be unsigned when there is relu, otherwise signed. " - f"Given output type {self.out_type} and has_relu {self.has_relu}" - ) return self diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index 8fe8e68..041b8bf 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -20,7 +20,7 @@ from Neureka import Neureka from typing import List, Union, Optional from NnxTestClasses import NnxTestConf -from TestClasses import implies, KernelShape, Padding, Stride, IntegerType +from TestClasses import implies, KernelShape, Stride, IntegerType from pydantic import field_validator, model_validator @@ -80,52 +80,18 @@ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType return v @model_validator(mode="after") # type: ignore - def check_valid_depthwise(self) -> NeurekaTestConf: + def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf: assert implies( self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." - assert implies(self.depthwise, self.in_channel == self.out_channel), ( - f"Input and output channel should be the same in a depthwise layer. " - f"input channel: {self.in_channel}, output channel: {self.out_channel}" - ) - return self - - @model_validator(mode="after") # type: ignore - def check_valid_padding_with_kernel_shape_1x1(self) -> NeurekaTestConf: - assert implies( - self.kernel_shape == KernelShape(height=1, width=1), - self.padding == Padding(top=0, bottom=0, left=0, right=0), - ), f"No padding on 1x1 kernel. Given padding {self.padding}" - return self - - @field_validator("has_norm_quant") - @classmethod - def check_valid_has_norm_quant(cls, v: bool) -> bool: - assert v == True, f"Untested without has_norm_quant." - return v - - @model_validator(mode="after") # type: ignore - def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NeurekaTestConf: - if self.has_norm_quant: - assert self.scale_type is not None, "Scale type was not provided." - if self.has_bias: - assert self.bias_type is not None, "Bias type was not provided." return self @model_validator(mode="after") # type: ignore - def check_valid_out_type_with_flags(self) -> NeurekaTestConf: + def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf: assert implies( not self.has_norm_quant, self.out_type == Neureka.ACCUMULATOR_TYPE ), ( f"Without quantization, the output type has to be equal to the " f"accumulator type {Neureka.ACCUMULATOR_TYPE}. Given output type {self.out_type}" ) - assert implies( - self.has_norm_quant, - (self.has_relu and not self.out_type._signed) - or (not self.has_relu and self.out_type._signed), - ), ( - f"Output type has to be unsigned when there is relu, otherwise signed. 
" - f"Given output type {self.out_type} and has_relu {self.has_relu}" - ) return self diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index 57c4f23..495d556 100644 --- a/test/NnxTestClasses.py +++ b/test/NnxTestClasses.py @@ -24,8 +24,8 @@ import torch.nn.functional as F import os from HeaderWriter import HeaderWriter -from TestClasses import IntegerType, Stride, Padding, KernelShape, implies -from pydantic import BaseModel, PositiveInt +from TestClasses import IntegerType, Stride, Padding, KernelShape, implies, xor +from pydantic import BaseModel, PositiveInt, field_validator, model_validator class NnxTestConf(BaseModel): @@ -46,6 +46,44 @@ class NnxTestConf(BaseModel): has_bias: bool has_relu: bool + @model_validator(mode="after") # type: ignore + def check_valid_depthwise_channels(self) -> NnxTestConf: + assert implies(self.depthwise, self.in_channel == self.out_channel), ( + f"Input and output channel should be the same in a depthwise layer. " + f"input channel: {self.in_channel}, output channel: {self.out_channel}" + ) + return self + + @model_validator(mode="after") # type: ignore + def check_valid_padding_with_kernel_shape_1x1(self) -> NnxTestConf: + assert implies( + self.kernel_shape == KernelShape(height=1, width=1), + self.padding == Padding(top=0, bottom=0, left=0, right=0), + ), f"No padding on 1x1 kernel. Given padding {self.padding}" + return self + + @field_validator("has_norm_quant") + @classmethod + def check_valid_has_norm_quant(cls, v: bool) -> bool: + assert v == True, f"Untested without has_norm_quant." + return v + + @model_validator(mode="after") # type: ignore + def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: + if self.has_norm_quant: + assert self.scale_type is not None, "Scale type was not provided." + if self.has_bias: + assert self.bias_type is not None, "Bias type was not provided." + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_relu(self) -> NnxTestConf: + assert xor(self.has_relu, not self.out_type._signed), ( + f"Output type has to be unsigned when there is relu, otherwise signed. " + f"Given output type {self.out_type} and has_relu {self.has_relu}" + ) + return self + class NnxTest: _CONF_NAME = "conf.json" diff --git a/test/TestClasses.py b/test/TestClasses.py index c10641c..450ba21 100644 --- a/test/TestClasses.py +++ b/test/TestClasses.py @@ -32,6 +32,10 @@ def implies(a: bool, b: bool): return (not a) or b +def xor(a: bool, b: bool): + return (a and not b) or (not a and b) + + class KernelShape(BaseModel): height: PositiveInt width: PositiveInt From 91effde65954f0616c618c1188a8a4ec2fd20325 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 15:01:23 +0100 Subject: [PATCH 30/72] Fix xored relu with unsigned instead of signed --- test/NnxTestClasses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index 495d556..c13ebb3 100644 --- a/test/NnxTestClasses.py +++ b/test/NnxTestClasses.py @@ -78,7 +78,7 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: @model_validator(mode="after") # type: ignore def check_valid_out_type_with_relu(self) -> NnxTestConf: - assert xor(self.has_relu, not self.out_type._signed), ( + assert xor(self.has_relu, self.out_type._signed), ( f"Output type has to be unsigned when there is relu, otherwise signed. 
" f"Given output type {self.out_type} and has_relu {self.has_relu}" ) From 0187793de537ed460bbbac3a84617f9535379d17 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 15:38:20 +0100 Subject: [PATCH 31/72] Change gvsoc functions have a body only when running in gvsoc --- ne16/gvsoc/ne16_gvsoc.h | 4 ++++ neureka/gvsoc/neureka_gvsoc.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/ne16/gvsoc/ne16_gvsoc.h b/ne16/gvsoc/ne16_gvsoc.h index f6626fd..e461416 100644 --- a/ne16/gvsoc/ne16_gvsoc.h +++ b/ne16/gvsoc/ne16_gvsoc.h @@ -42,13 +42,17 @@ typedef enum ne16_gvsoc_log_level_e { static void ne16_gvsoc_log_activate(ne16_dev_t *dev, ne16_gvsoc_log_level_e log_level, ne16_gvsoc_log_format_e format) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL, log_level); hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_FORMAT, format); +#endif } static void ne16_gvsoc_log_deactivate(ne16_dev_t *dev) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL, NE16_GVSOC_LOG_LEVEL_CONFIG); +#endif } #endif // __NE16_GVSOC_H__ diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h index 37eeab0..e163036 100644 --- a/neureka/gvsoc/neureka_gvsoc.h +++ b/neureka/gvsoc/neureka_gvsoc.h @@ -42,13 +42,17 @@ typedef enum neureka_gvsoc_log_level_e { static void neureka_gvsoc_log_activate(neureka_dev_t *dev, neureka_gvsoc_log_level_e log_level, neureka_gvsoc_log_format_e format) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level); hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format); +#endif } static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END); +#endif } #endif // __NEUREKA_GVSOC_H__ From 4fab4803bdecfc0067f8d6a9df1324a38169ae9a Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 15:50:43 +0100 Subject: [PATCH 32/72] Change remove %s from main strings --- test/HeaderWriter.py | 5 +++-- test/app/src/main.c | 10 +++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index 5fd0968..e8adb0b 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -48,8 +48,9 @@ def define(self, name, expr): if isinstance(expr, str): expr = f'"{expr}"' elif isinstance(expr, bool): - expr = int(expr) - expr = f"({expr})" + expr = f'({int(expr)})' + else: + expr = f"({expr})" return f"#define {name.upper()} {expr}\n" def vector_size(self, data): diff --git a/test/app/src/main.c b/test/app/src/main.c index a2e7fd5..08d27a1 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -20,22 +20,18 @@ #include -#include "bias.h" -#include "input.h" #include "layer_util.h" #include "nnx_layer.h" #include "output.h" -#include "scale.h" -#include "weight.h" int main() { struct pi_device cl_dev; struct pi_cluster_conf cl_conf; struct pi_cluster_task cl_task; - printf("\nTest %s starting\n", TEST_NAME); + printf("\nTest " TEST_NAME " starting\n"); - printf("\nAccelerator: %s\n", NNX_ACCELERATOR); + printf("\nAccelerator: " NNX_ACCELERATOR "\n"); printf("\n"); layer_info(); @@ -50,7 +46,7 @@ int main() { &cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL)); pi_cluster_close(&cl_dev); - printf("\nTest %s finished\n", TEST_NAME); + printf("\nTest " TEST_NAME " finished\n"); 
printf("\n"); check_output(); From 3906e7ae5d4d1e719e51a46b5c84b598df291654 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 15:55:02 +0100 Subject: [PATCH 33/72] Fix output checking should be done before cluster close since data is in L1 --- test/app/src/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/app/src/main.c b/test/app/src/main.c index 08d27a1..7cce4bf 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -44,12 +44,13 @@ int main() { } pi_cluster_send_task_to_cl( &cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL)); - pi_cluster_close(&cl_dev); - - printf("\nTest " TEST_NAME " finished\n"); printf("\n"); check_output(); + pi_cluster_close(&cl_dev); + + printf("\nTest " TEST_NAME " finished\n"); + return 0; } From 774cfd390773c728919c11dedff46a1cabceac4b Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 15:57:03 +0100 Subject: [PATCH 34/72] Remove L2 copies of tensors --- test/HeaderWriter.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index e8adb0b..720db15 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -136,7 +136,6 @@ def generate_vector_header(self, name, size, _type, init=None, golden=None): render = "" render += self.includes render += self.render_vector(name, "extern " + _type, size) - render += self.render_vector(name + "_l2", "extern " + _type, size) if golden is not None: render += self.render_vector("golden_" + name, "extern " + _type, size) @@ -157,7 +156,6 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None): render = "" render += f'#include "{name}.h"\n\n' render += self.render_vector(name, "PI_L1 " + _type, size, init=init) - render += self.render_vector(name + "_l2", "PI_L2 " + _type, size, init=init) if golden is not None: render += self.render_vector( From 4358ed52cb38a96de696c5e41219fe27df90283f Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 15:57:37 +0100 Subject: [PATCH 35/72] Replace pointer arithmetic with array indexing --- util/hwpe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/hwpe.c b/util/hwpe.c index 53c1ace..0430081 100644 --- a/util/hwpe.c +++ b/util/hwpe.c @@ -31,11 +31,11 @@ #define HWPE_TASK_REG_OFFSET 8 inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { - *(dev->base_addr + reg) = value; + dev->base_addr[reg] = value; } inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) { - return *(dev->base_addr + reg); + return dev->base_addr[reg]; } inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { From 668c0457e1c80630d8e96fbdc53f92ef18817482 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 16:25:43 +0100 Subject: [PATCH 36/72] Fix formatting --- test/HeaderWriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index 720db15..07dc597 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -48,7 +48,7 @@ def define(self, name, expr): if isinstance(expr, str): expr = f'"{expr}"' elif isinstance(expr, bool): - expr = f'({int(expr)})' + expr = f"({int(expr)})" else: expr = f"({expr})" return f"#define {name.upper()} {expr}\n" From cf70c78e7a44b56811ad33d5b6bef7071ee32eed Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 22 Jan 2024 16:29:06 +0100 Subject: [PATCH 37/72] Fix Neureka name in readme --- neureka/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neureka/README.md 
b/neureka/README.md index a5043f0..bca128a 100644 --- a/neureka/README.md +++ b/neureka/README.md @@ -1,4 +1,4 @@ -# NE16 +# Neureka ## Docs From b02e01879df518821c7890a00e45be06a43a794f Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 24 Jan 2024 18:15:10 +0100 Subject: [PATCH 38/72] Align names with Neureka.py --- test/Ne16.py | 61 +++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/test/Ne16.py b/test/Ne16.py index d6abaf2..5d2c681 100644 --- a/test/Ne16.py +++ b/test/Ne16.py @@ -32,63 +32,70 @@ def weight_unroll( ) -> npt.NDArray[np.uint8]: """Unroll weight into expected memory format - Expected weight shape is (Cout, Cin, H, W). - The output shape is: (Cout, Cin_major, Bits, H x W, Cin_minor_bytes), - where Cin_major is the ceil(Cin / CIN_SUBTILE) and Cin_minor has to be padded with 0 to CIN_SUBTILE. + Expected weight shape is (cout, cin, height, width). + The output shape is: (cout, cinMajor, Bits, height x width, cinMinorBytes), + where cinMajor is the ceil(cin / CIN_SUBTILE) and cinMinor has to be padded with 0 to CIN_SUBTILE. """ if depthwise: - weight = weight.transpose(1, 0, 2, 3) # Swap Cout and Cin + weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin - Cout, Cin, H, W = weight.shape + cout, cin, height, width = weight.shape - # Pad Cin to be divisible with CIN_SUBTILE - if Cin % Ne16._CIN_SUBTILE != 0: - Cin_pad = Ne16._CIN_SUBTILE - Cin % Ne16._CIN_SUBTILE + # Pad cin to be divisible with CIN_SUBTILE + if cin % Ne16._CIN_SUBTILE != 0: + cin_pad = Ne16._CIN_SUBTILE - cin % Ne16._CIN_SUBTILE weight = np.pad( weight, - ((0, 0), (0, Cin_pad), (0, 0), (0, 0)), + ((0, 0), (0, cin_pad), (0, 0), (0, 0)), "constant", constant_values=0, ) - # Reshape into (Cout, Cin_major, Cin_minor, Flattened spatial, 1) + # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1) # The 1 at the end is required by the unpacking - Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE)) - Cin_minor = Ne16._CIN_SUBTILE - weight = weight.reshape(Cout, Cin_major, Cin_minor, H * W, 1) + cinMajor = int(np.ceil(cin / Ne16._CIN_SUBTILE)) + cinMinor = Ne16._CIN_SUBTILE + weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] - # (Cout, Cin_major, Cin_minor, Flattened spatial, Bits) + # (cout, cinMajor, cinMinor, flattened spatial, Bits) weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") # Shuffle bits so that the final shape is: - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor) + # (cout, cinMajor, Bits, flattened spatial, cinMinor) weight = weight.transpose(0, 1, 4, 3, 2) # Prepare for packing - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes, 8) - Cin_minor_bytes = int(np.ceil(Cin_minor / 8)) - weight = np.stack(np.split(weight, Cin_minor_bytes, axis=-1), axis=-2) + # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes, 8) + cinMinorBytes = int(np.ceil(cinMinor / 8)) + weight = np.stack(np.split(weight, cinMinorBytes, axis=-1), axis=-2) # Pack - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes) + # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes) weight = np.packbits(weight, axis=-1, bitorder="little") return weight.flatten() @staticmethod - def weight_roll(weight: np.ndarray, bits: int, Cout: int, Cin: int, H: int, W: int): + def weight_roll( + weight: npt.NDArray[np.uint8], + bits: int, + cout: int, + cin: int, + height: int, + width: int, + ) -> npt.NDArray[np.uint8]: """Reverse of weight_roll""" - Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE)) - Cin_minor = Ne16._CIN_SUBTILE - Cin_minor_bytes = int(np.ceil(Cin_minor / 8)) + cinMajor = int(np.ceil(cin / Ne16._CIN_SUBTILE)) + cinMinor = Ne16._CIN_SUBTILE + cinMinorBytes = int(np.ceil(cinMinor / 8)) - weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor_bytes, 1) + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinorBytes, 1) weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") - weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor) + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor) weight = weight.transpose(0, 1, 4, 3, 2) weight = np.packbits(weight, axis=-1, bitorder="little") - weight = weight.reshape(Cout, Cin_major * Cin_minor, H, W) - weight = weight[:, :Cin, :, :] + weight = weight.reshape(cout, cinMajor * cinMinor, height, width) + weight = weight[:, :cin, :, :] return weight From 5e61e39212ef10fbd138593351f57a7363d99d06 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 24 Jan 2024 18:11:51 +0100 Subject: [PATCH 39/72] Pad cin --- test/Ne16.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/Ne16.py b/test/Ne16.py index 5d2c681..1601b7c 100644 --- a/test/Ne16.py +++ b/test/Ne16.py @@ -25,6 +25,7 @@ class Ne16: ACCUMULATOR_TYPE = IntegerType(name="int32") _CIN_SUBTILE = 16 + _LINEAR_CIN_TILE = 256 @staticmethod def weight_unroll( @@ -43,17 +44,18 @@ def weight_unroll( # Pad cin to be divisible with CIN_SUBTILE if cin % Ne16._CIN_SUBTILE != 0: - cin_pad = Ne16._CIN_SUBTILE - cin % Ne16._CIN_SUBTILE + cinPad = Ne16._CIN_SUBTILE - cin % Ne16._CIN_SUBTILE weight = np.pad( weight, - ((0, 0), (0, cin_pad), (0, 0), (0, 0)), + ((0, 0), (0, cinPad), (0, 0), (0, 0)), "constant", constant_values=0, ) + cin = cin + cinPad # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1) # The 1 at the end is required by the unpacking - cinMajor = int(np.ceil(cin / Ne16._CIN_SUBTILE)) + cinMajor = cin // Ne16._CIN_SUBTILE cinMinor = Ne16._CIN_SUBTILE weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) From 8345b1d224def07399aca3b9bbd8202ae6322bce Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 25 Jan 2024 12:35:10 +0100 Subject: [PATCH 40/72] 
Rename .py to MemoryLayout.py --- test/{Ne16.py => Ne16MemoryLayout.py} | 19 ++++++------ test/Ne16TestConf.py | 6 ++-- test/{Neureka.py => NeurekaMemoryLayout.py} | 22 ++++++------- test/NeurekaTestConf.py | 6 ++-- test/conftest.py | 12 ++++---- test/test.py | 10 +++--- test/testgen.py | 34 +++++++++++++-------- 7 files changed, 59 insertions(+), 50 deletions(-) rename test/{Ne16.py => Ne16MemoryLayout.py} (88%) rename test/{Neureka.py => NeurekaMemoryLayout.py} (88%) diff --git a/test/Ne16.py b/test/Ne16MemoryLayout.py similarity index 88% rename from test/Ne16.py rename to test/Ne16MemoryLayout.py index 1601b7c..2b58a7a 100644 --- a/test/Ne16.py +++ b/test/Ne16MemoryLayout.py @@ -21,14 +21,13 @@ from TestClasses import IntegerType -class Ne16: +class Ne16MemoryLayout: ACCUMULATOR_TYPE = IntegerType(name="int32") _CIN_SUBTILE = 16 - _LINEAR_CIN_TILE = 256 @staticmethod - def weight_unroll( + def weightEncode( weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False ) -> npt.NDArray[np.uint8]: """Unroll weight into expected memory format @@ -43,8 +42,8 @@ def weight_unroll( cout, cin, height, width = weight.shape # Pad cin to be divisible with CIN_SUBTILE - if cin % Ne16._CIN_SUBTILE != 0: - cinPad = Ne16._CIN_SUBTILE - cin % Ne16._CIN_SUBTILE + if cin % Ne16MemoryLayout._CIN_SUBTILE != 0: + cinPad = Ne16MemoryLayout._CIN_SUBTILE - cin % Ne16MemoryLayout._CIN_SUBTILE weight = np.pad( weight, ((0, 0), (0, cinPad), (0, 0), (0, 0)), @@ -55,8 +54,8 @@ def weight_unroll( # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1) # The 1 at the end is required by the unpacking - cinMajor = cin // Ne16._CIN_SUBTILE - cinMinor = Ne16._CIN_SUBTILE + cinMajor = cin // Ne16MemoryLayout._CIN_SUBTILE + cinMinor = Ne16MemoryLayout._CIN_SUBTILE weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0] @@ -79,7 +78,7 @@ def weight_unroll( return weight.flatten() @staticmethod - def weight_roll( + def weightDecode( weight: npt.NDArray[np.uint8], bits: int, cout: int, @@ -88,8 +87,8 @@ def weight_roll( width: int, ) -> npt.NDArray[np.uint8]: """Reverse of weight_roll""" - cinMajor = int(np.ceil(cin / Ne16._CIN_SUBTILE)) - cinMinor = Ne16._CIN_SUBTILE + cinMajor = int(np.ceil(cin / Ne16MemoryLayout._CIN_SUBTILE)) + cinMinor = Ne16MemoryLayout._CIN_SUBTILE cinMinorBytes = int(np.ceil(cinMinor / 8)) weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinorBytes, 1) diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py index 7e4bd01..74479d2 100644 --- a/test/Ne16TestConf.py +++ b/test/Ne16TestConf.py @@ -18,7 +18,7 @@ from __future__ import annotations from typing import List, Union, Optional -from Ne16 import Ne16 +from Ne16MemoryLayout import Ne16MemoryLayout from NnxTestClasses import NnxTestConf from TestClasses import implies, KernelShape, Stride, IntegerType from pydantic import field_validator, model_validator @@ -98,9 +98,9 @@ def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf: @model_validator(mode="after") # type: ignore def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf: assert implies( - not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE + not self.has_norm_quant, self.out_type == Ne16MemoryLayout.ACCUMULATOR_TYPE ), ( f"Without quantization, the output type has to be equal to the " - f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + f"accumulator type {Ne16MemoryLayout.ACCUMULATOR_TYPE}. 
Given output type {self.out_type}" ) return self diff --git a/test/Neureka.py b/test/NeurekaMemoryLayout.py similarity index 88% rename from test/Neureka.py rename to test/NeurekaMemoryLayout.py index d844234..1fe59f2 100644 --- a/test/Neureka.py +++ b/test/NeurekaMemoryLayout.py @@ -22,7 +22,7 @@ from TestClasses import IntegerType -class Neureka: +class NeurekaMemoryLayout: ACCUMULATOR_TYPE = IntegerType(name="int32") _WEIGHT_BANDWIDTH = 256 @@ -30,7 +30,7 @@ class Neureka: _CIN_SUBTILE_3x3 = 28 @staticmethod - def weight_unroll( + def weightEncode( weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False ) -> npt.NDArray[np.uint8]: """Unroll weight into expected memory format @@ -46,7 +46,7 @@ def weight_unroll( cout, cin, height, width = weight.shape cinSubtile = ( - Neureka._CIN_SUBTILE_3x3 if height == 3 else Neureka._CIN_SUBTILE_1x1 + NeurekaMemoryLayout._CIN_SUBTILE_3x3 if height == 3 else NeurekaMemoryLayout._CIN_SUBTILE_1x1 ) # Pad cin to be divisible with CIN_SUBTILE @@ -80,7 +80,7 @@ def weight_unroll( # (-1, Weight Bandwidth) weight = np.pad( weight, - ((0, 0), (0, Neureka._WEIGHT_BANDWIDTH - weight.shape[-1])), + ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])), "constant", constant_values=0, ) @@ -103,12 +103,12 @@ def weight_unroll( weight = weight.transpose(0, 1, 3, 4, 2, 5) # (-1, Weight Bandwidth) weight = weight.reshape( - cout * cinMajor, Neureka._WEIGHT_BANDWIDTH + cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH ) # cout*cinMajor, 256b # Prepare for packing # (-1, Weight Bandwidth Bytes, 8) - weightBandwidthBytes = int(np.ceil(Neureka._WEIGHT_BANDWIDTH / 8)) + weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8)) weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2) # Pack bits @@ -118,7 +118,7 @@ def weight_unroll( return weight.flatten() @staticmethod - def weight_roll( + def weightDecode( weight: npt.NDArray[np.uint8], bits: int, cout: int, @@ -126,17 +126,17 @@ def weight_roll( height: int, width: int, ) -> npt.NDArray[np.uint8]: - """Reverse of weight_unroll""" + """Reverse of weightEncode""" cinSubtile = ( - Neureka._CIN_SUBTILE_3x3 if height == 3 else Neureka._CIN_SUBTILE_1x1 + NeurekaMemoryLayout._CIN_SUBTILE_3x3 if height == 3 else NeurekaMemoryLayout._CIN_SUBTILE_1x1 ) cinMajor = int(np.ceil(cin / cinSubtile)) cinMinor = cinSubtile - weightBandwidthBytes = int(np.ceil(Neureka._WEIGHT_BANDWIDTH / 8)) + weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8)) weight = weight.reshape(-1, weightBandwidthBytes, 1) weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") - weight = weight.reshape(-1, Neureka._WEIGHT_BANDWIDTH) + weight = weight.reshape(-1, NeurekaMemoryLayout._WEIGHT_BANDWIDTH) if height == 3 and width == 3: weight = weight[:, : height * width * cinMinor] diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index 041b8bf..a82fef8 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -17,7 +17,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from Neureka import Neureka +from NeurekaMemoryLayout import NeurekaMemoryLayout from typing import List, Union, Optional from NnxTestClasses import NnxTestConf from TestClasses import implies, KernelShape, Stride, IntegerType @@ -89,9 +89,9 @@ def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf: @model_validator(mode="after") # type: ignore def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf: assert implies( - 
not self.has_norm_quant, self.out_type == Neureka.ACCUMULATOR_TYPE + not self.has_norm_quant, self.out_type == NeurekaMemoryLayout.ACCUMULATOR_TYPE ), ( f"Without quantization, the output type has to be equal to the " - f"accumulator type {Neureka.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + f"accumulator type {NeurekaMemoryLayout.ACCUMULATOR_TYPE}. Given output type {self.out_type}" ) return self diff --git a/test/conftest.py b/test/conftest.py index 8f745e7..09a18c1 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -21,9 +21,9 @@ import pytest import pydantic -from Ne16 import Ne16 +from Ne16MemoryLayout import Ne16MemoryLayout from Ne16TestConf import Ne16TestConf -from Neureka import Neureka +from NeurekaMemoryLayout import NeurekaMemoryLayout from NeurekaTestConf import NeurekaTestConf from NnxTestClasses import NnxTest, NnxTestGenerator @@ -82,10 +82,10 @@ def pytest_generate_tests(metafunc): nnxName = metafunc.config.getoption("accelerator") if nnxName == "ne16": - nnxCls = Ne16 + nnxMemoryLayoutCls = Ne16MemoryLayout nnxTestConfCls = Ne16TestConf elif nnxName == "neureka": - nnxCls = Neureka + nnxMemoryLayoutCls = NeurekaMemoryLayout nnxTestConfCls = NeurekaTestConf else: assert ( @@ -105,7 +105,7 @@ def pytest_generate_tests(metafunc): test = NnxTest.load(nnxTestConfCls, test_dir) # (Re)generate data if not test.is_valid() or regenerate: - test = NnxTestGenerator.from_conf(test.conf, nnxCls.ACCUMULATOR_TYPE) + test = NnxTestGenerator.from_conf(test.conf, nnxMemoryLayoutCls.ACCUMULATOR_TYPE) test.save_data(test_dir) nnxTestAndNames.append((test, test_dir)) except pydantic.ValidationError as e: @@ -122,4 +122,4 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("nnxTestAndName", nnxTestAndNames) metafunc.parametrize("timeout", [timeout]) metafunc.parametrize("nnxName", [nnxName]) - metafunc.parametrize("nnxCls", [nnxCls]) + metafunc.parametrize("nnxMemoryLayoutCls", [nnxMemoryLayoutCls]) diff --git a/test/test.py b/test/test.py index 542a937..3ee7ea1 100644 --- a/test/test.py +++ b/test/test.py @@ -21,8 +21,8 @@ from typing import Dict, Union, Optional, Tuple, Type import locale import subprocess -from Ne16 import Ne16 -from Neureka import Neureka +from Ne16MemoryLayout import Ne16MemoryLayout +from NeurekaMemoryLayout import NeurekaMemoryLayout from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator from pathlib import Path @@ -112,10 +112,12 @@ def test( nnxTestAndName: Tuple[NnxTest, str], timeout: int, nnxName: str, - nnxCls: Union[Type[Ne16], Type[Neureka]], + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], ): nnxTest, nnxTestName = nnxTestAndName - NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(nnxTestName, nnxTest) + NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( + nnxTestName, nnxTest + ) Path("app/src/nnx_layer.c").touch() cmd = f"make -C app all run platform=gvsoc" diff --git a/test/testgen.py b/test/testgen.py index 43b4160..899f0a1 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -21,9 +21,9 @@ import json import toml from typing import Optional, Type, Union, Set -from Ne16 import Ne16 +from Ne16MemoryLayout import Ne16MemoryLayout from Ne16TestConf import Ne16TestConf -from Neureka import Neureka +from NeurekaMemoryLayout import NeurekaMemoryLayout from NeurekaTestConf import NeurekaTestConf from NnxTestClasses import ( NnxTest, @@ -35,7 +35,7 @@ def headers_gen( args, - nnxCls: Union[Type[Ne16], Type[Neureka]], + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], 
Type[NeurekaMemoryLayout]], nnxTestConfCls: Type[NnxTestConf], test: Optional[NnxTest] = None, ): @@ -43,8 +43,12 @@ def headers_gen( test = NnxTest.load(nnxTestConfCls, args.test_dir) assert test is not None if not test.is_valid(): - test = NnxTestGenerator.from_conf(test.conf, nnxCls.ACCUMULATOR_TYPE) - NnxTestHeaderGenerator(nnxCls.weight_unroll).generate(args.test_dir, test) + test = NnxTestGenerator.from_conf( + test.conf, nnxMemoryLayoutCls.ACCUMULATOR_TYPE + ) + NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( + args.test_dir, test + ) def print_tensors(test: NnxTest): @@ -63,7 +67,9 @@ def print_tensors(test: NnxTest): def test_gen( - args, nnxCls: Union[Type[Ne16], Type[Neureka]], nnxTestConfCls: Type[NnxTestConf] + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], ): if args.conf.endswith(".toml"): test_conf_dict = toml.load(args.conf) @@ -78,12 +84,12 @@ def test_gen( test_conf = nnxTestConfCls.model_validate(test_conf_dict) test = NnxTestGenerator.from_conf( - test_conf, nnxCls.ACCUMULATOR_TYPE, verbose=args.print_tensors + test_conf, nnxMemoryLayoutCls.ACCUMULATOR_TYPE, verbose=args.print_tensors ) if not args.skip_save: test.save(args.test_dir) if args.headers: - headers_gen(args, nnxCls, nnxTestConfCls, test) + headers_gen(args, nnxMemoryLayoutCls, nnxTestConfCls, test) if args.print_tensors: print_tensors(test) @@ -112,9 +118,11 @@ def _regen_recursive( def test_regen( - args, nnxCls: Union[Type[Ne16], Type[Neureka]], nnxTestConfCls: Type[NnxTestConf] + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], ): - _ = nnxCls + _ = nnxMemoryLayoutCls regen_tensors = set(args.tensors + ["output"]) for test_dir in args.test_dirs: @@ -207,12 +215,12 @@ def add_common_arguments(parser: argparse.ArgumentParser): args = parser.parse_args() if args.accelerator == "ne16": - nnxCls = Ne16 + nnxMemoryLayoutCls = Ne16MemoryLayout nnxTestConfCls = Ne16TestConf elif args.accelerator == "neureka": - nnxCls = Neureka + nnxMemoryLayoutCls = NeurekaMemoryLayout nnxTestConfCls = NeurekaTestConf else: assert False, f"Unsupported accelerator {args.accelerator}." 
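For reference, the memory layout produced by the Ne16MemoryLayout.weightEncode method renamed above can be stated compactly in C. The sketch below is illustrative only (the helper name and signature are not part of the library) and covers the non-depthwise case, assuming cin has already been padded to a multiple of 16: per (cout, cinMajor) subtile the data is bit-plane major, then spatial, with each group of 16 input channels packed LSB-first into two bytes.

#include <stdint.h>

#define CIN_SUBTILE 16

/* Hypothetical scalar rendering of Ne16MemoryLayout.weightEncode.
 * src is (cout, cin, h, w), one uint8 per qw-bit weight;
 * dst must hold cout * (cin / 16) * qw * h * w * 2 bytes. */
void ne16_weight_encode_sketch(const uint8_t *src, uint8_t *dst, int cout,
                               int cin, int h, int w, int qw) {
  const int cin_major = cin / CIN_SUBTILE;
  const int spatial = h * w;
  for (int co = 0; co < cout; co++)
    for (int cm = 0; cm < cin_major; cm++)
      for (int b = 0; b < qw; b++)                /* bit-plane major */
        for (int px = 0; px < spatial; px++)      /* then spatial */
          for (int byte = 0; byte < CIN_SUBTILE / 8; byte++) {
            uint8_t packed = 0;
            for (int bit = 0; bit < 8; bit++) {   /* 8 channels per byte */
              const int ci = cm * CIN_SUBTILE + byte * 8 + bit;
              const uint8_t q = src[(co * cin + ci) * spatial + px];
              packed |= ((q >> b) & 1) << bit;    /* little bit order */
            }
            *dst++ = packed;
          }
}

weightDecode is the exact inverse: it unpacks the bytes, regroups the bit planes, and restores the (cout, cin, h, w) view.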
-args.func(args, nnxCls, nnxTestConfCls) +args.func(args, nnxMemoryLayoutCls, nnxTestConfCls) From a623ccf586172d8b7b9f9ce071459c77a4609733 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 25 Jan 2024 13:50:14 +0100 Subject: [PATCH 41/72] Extract functional model from test gen --- test/Ne16MemoryLayout.py | 3 - test/Ne16TestConf.py | 7 +- test/NeuralEngineFunctionalModel.py | 100 +++++++++++++++++++++++ test/NeurekaMemoryLayout.py | 10 ++- test/NeurekaTestConf.py | 7 +- test/NnxTestClasses.py | 120 ++++++++-------------------- test/conftest.py | 2 +- test/testgen.py | 8 +- 8 files changed, 150 insertions(+), 107 deletions(-) create mode 100644 test/NeuralEngineFunctionalModel.py diff --git a/test/Ne16MemoryLayout.py b/test/Ne16MemoryLayout.py index 2b58a7a..30729ab 100644 --- a/test/Ne16MemoryLayout.py +++ b/test/Ne16MemoryLayout.py @@ -18,12 +18,9 @@ import numpy as np import numpy.typing as npt -from TestClasses import IntegerType class Ne16MemoryLayout: - ACCUMULATOR_TYPE = IntegerType(name="int32") - _CIN_SUBTILE = 16 @staticmethod diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py index 74479d2..2470421 100644 --- a/test/Ne16TestConf.py +++ b/test/Ne16TestConf.py @@ -18,7 +18,7 @@ from __future__ import annotations from typing import List, Union, Optional -from Ne16MemoryLayout import Ne16MemoryLayout +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel from NnxTestClasses import NnxTestConf from TestClasses import implies, KernelShape, Stride, IntegerType from pydantic import field_validator, model_validator @@ -98,9 +98,10 @@ def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf: @model_validator(mode="after") # type: ignore def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf: assert implies( - not self.has_norm_quant, self.out_type == Ne16MemoryLayout.ACCUMULATOR_TYPE + not self.has_norm_quant, + self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, ), ( f"Without quantization, the output type has to be equal to the " - f"accumulator type {Ne16MemoryLayout.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. 
Given output type {self.out_type}" ) return self diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py new file mode 100644 index 0000000..5378fa4 --- /dev/null +++ b/test/NeuralEngineFunctionalModel.py @@ -0,0 +1,100 @@ +from typing import Optional +import torch +import torch.nn.functional as F +from NnxTestClasses import NnxTestConf +from TestClasses import IntegerType + + +class NeuralEngineFunctionalModel: + ACCUMULATOR_TYPE = IntegerType(name="int32") + + @staticmethod + def _cast( + tensor: torch.Tensor, _type: IntegerType, saturate: bool = False + ) -> torch.Tensor: + if saturate: + return tensor.clamp(_type.min, _type.max) + else: + return tensor & ((1 << _type._bits) - 1) + + def _norm_quant( + self, + conf: NnxTestConf, + tensor: torch.Tensor, + scale: torch.Tensor, + bias: Optional[torch.Tensor], + global_shift: torch.Tensor, + ) -> torch.Tensor: + # Scale accumulators are in 48bit, so keeping the data in 64bit + tensor = scale * tensor + assert tensor.dtype == torch.int64 + + if conf.has_bias: + assert bias is not None + assert conf.bias_type is not None + # Saturating cast to int32 + tensor = NeuralEngineFunctionalModel._cast( + tensor, conf.bias_type, saturate=True + ).type(torch.int32) + + tensor = tensor + bias + tensor = NeuralEngineFunctionalModel._cast( + tensor, conf.bias_type, saturate=False + ).type(torch.int32) + + if conf.has_relu: + tensor = F.relu(tensor) + + tensor = tensor >> global_shift + + # Saturate into out_type + tensor = NeuralEngineFunctionalModel._cast(tensor, conf.out_type, saturate=True) + + return tensor + + def convolution( + self, + conf: NnxTestConf, + input: torch.Tensor, + weight: torch.Tensor, + scale: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + global_shift: Optional[torch.Tensor] = None, + verbose: bool = False, + ) -> torch.Tensor: + input_padded = F.pad( + input, + ( + conf.padding.left, + conf.padding.right, + conf.padding.top, + conf.padding.bottom, + ), + "constant", + 0, + ) + + # Accumulators are 32bit non-saturating. 
+ # Calculate in higher precision (int64) + output = F.conv2d( + input=input_padded, + weight=weight, + stride=(conf.stride.height, conf.stride.width), + groups=conf.in_channel if conf.depthwise else 1, + ).type(torch.int64) + + # Cast to accumulator type + output = NeuralEngineFunctionalModel._cast( + output, NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, saturate=False + ).type(torch.int32) + + if verbose: + print("INTERMEDIATE RESULTS (pre-normalization/requant):") + print(output) + + if conf.has_norm_quant: + assert scale is not None + assert global_shift is not None + output = self._norm_quant(conf, output, scale, bias, global_shift) + + return output diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py index 1fe59f2..52d1f53 100644 --- a/test/NeurekaMemoryLayout.py +++ b/test/NeurekaMemoryLayout.py @@ -23,8 +23,6 @@ class NeurekaMemoryLayout: - ACCUMULATOR_TYPE = IntegerType(name="int32") - _WEIGHT_BANDWIDTH = 256 _CIN_SUBTILE_1x1 = 32 _CIN_SUBTILE_3x3 = 28 @@ -46,7 +44,9 @@ def weightEncode( cout, cin, height, width = weight.shape cinSubtile = ( - NeurekaMemoryLayout._CIN_SUBTILE_3x3 if height == 3 else NeurekaMemoryLayout._CIN_SUBTILE_1x1 + NeurekaMemoryLayout._CIN_SUBTILE_3x3 + if height == 3 + else NeurekaMemoryLayout._CIN_SUBTILE_1x1 ) # Pad cin to be divisible with CIN_SUBTILE @@ -128,7 +128,9 @@ def weightDecode( ) -> npt.NDArray[np.uint8]: """Reverse of weightEncode""" cinSubtile = ( - NeurekaMemoryLayout._CIN_SUBTILE_3x3 if height == 3 else NeurekaMemoryLayout._CIN_SUBTILE_1x1 + NeurekaMemoryLayout._CIN_SUBTILE_3x3 + if height == 3 + else NeurekaMemoryLayout._CIN_SUBTILE_1x1 ) cinMajor = int(np.ceil(cin / cinSubtile)) cinMinor = cinSubtile diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index a82fef8..038e66b 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -17,8 +17,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from NeurekaMemoryLayout import NeurekaMemoryLayout from typing import List, Union, Optional +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel from NnxTestClasses import NnxTestConf from TestClasses import implies, KernelShape, Stride, IntegerType from pydantic import field_validator, model_validator @@ -89,9 +89,10 @@ def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf: @model_validator(mode="after") # type: ignore def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf: assert implies( - not self.has_norm_quant, self.out_type == NeurekaMemoryLayout.ACCUMULATOR_TYPE + not self.has_norm_quant, + self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, ), ( f"Without quantization, the output type has to be equal to the " - f"accumulator type {NeurekaMemoryLayout.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. 
Given output type {self.out_type}" ) return self diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index c13ebb3..e829f31 100644 --- a/test/NnxTestClasses.py +++ b/test/NnxTestClasses.py @@ -17,13 +17,14 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import Callable, Union, Optional, Set, Tuple, Type +from typing import Callable, Literal, Union, Optional, Set, Tuple, Type import torch import numpy as np import numpy.typing as npt import torch.nn.functional as F import os from HeaderWriter import HeaderWriter +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel from TestClasses import IntegerType, Stride, Padding, KernelShape, implies, xor from pydantic import BaseModel, PositiveInt, field_validator, model_validator @@ -182,36 +183,21 @@ class NnxTestGenerator: _DEFAULT_SEED = 0 @staticmethod - def _global_shift( - tensor: torch.Tensor, out_type: IntegerType, has_relu: bool + def _calculate_global_shift( + tensor: torch.Tensor, out_type: IntegerType ) -> torch.Tensor: - if has_relu: - # only adjust positive values - tensor = tensor[tensor > 0] - + """Calculate global shift so that the output values are in the range of out_type""" s = tensor.type(torch.float64).std() target_s = 2 ** (out_type._bits - 1) - global_shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32) - - return global_shift + return torch.ceil(torch.log2(s / target_s)).type(torch.int32) @staticmethod - def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]): + def _random_data(_type: IntegerType, shape: Tuple): return torch.randint(_type.min, _type.max, size=shape) - @staticmethod - def _cast( - tensor: torch.Tensor, _type: IntegerType, saturate: bool = False - ) -> torch.Tensor: - if saturate: - return tensor.clamp(_type.min, _type.max) - else: - return tensor & ((1 << _type._bits) - 1) - @staticmethod def from_conf( conf: NnxTestConf, - accumulator_type: IntegerType, input: Optional[torch.Tensor] = None, weight: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None, @@ -221,89 +207,49 @@ def from_conf( ) -> NnxTest: torch.manual_seed(NnxTestGenerator._DEFAULT_SEED) + input_shape = (1, conf.in_channel, conf.in_height, conf.in_width) + weight_shape = ( + conf.out_channel, + 1 if conf.depthwise else conf.in_channel, + conf.kernel_shape.height, + conf.kernel_shape.width, + ) + scale_shape = (1, conf.out_channel, 1, 1) + bias_shape = (1, conf.out_channel, 1, 1) + if input is None: input = NnxTestGenerator._random_data( _type=conf.in_type, - shape=(1, conf.in_channel, conf.in_height, conf.in_width), + shape=input_shape, ) - input_padded = F.pad( - input, - ( - conf.padding.left, - conf.padding.right, - conf.padding.top, - conf.padding.bottom, - ), - "constant", - 0, - ) - if weight is None: weight = NnxTestGenerator._random_data( _type=conf.weight_type, - shape=( - conf.out_channel, - 1 if conf.depthwise else conf.in_channel, - conf.kernel_shape.height, - conf.kernel_shape.width, - ), + shape=weight_shape, ) - # Accumulators are 32bit non-saturating. 
- # Calculate in higher precision (int64) - output = F.conv2d( - input=input_padded, - weight=weight, - stride=(conf.stride.height, conf.stride.width), - groups=conf.in_channel if conf.depthwise else 1, - ).type(torch.int64) - # Use only the lower 32bits - output = NnxTestGenerator._cast(output, accumulator_type, saturate=False).type( - torch.int32 - ) - - if verbose: - print("INTERMEDIATE RESULTS (pre-normalization/requant):") - print(output) - if conf.has_norm_quant: if scale is None: assert conf.scale_type is not None scale = NnxTestGenerator._random_data( - conf.scale_type, shape=(1, conf.out_channel, 1, 1) + conf.scale_type, shape=scale_shape ) - # Scale accumulators are in 48bit, so keeping the data in 64bit - output = scale * output - assert output.dtype == torch.int64 - - if conf.has_bias: - # Saturating cast to int32 + if conf.has_bias and bias is None: assert conf.bias_type is not None - output = NnxTestGenerator._cast( - output, conf.bias_type, saturate=True + bias = NnxTestGenerator._random_data( + conf.bias_type, shape=bias_shape ).type(torch.int32) - - if bias is None: - bias = NnxTestGenerator._random_data( - conf.bias_type, shape=(1, conf.out_channel, 1, 1) - ).type(torch.int32) - output = output + bias - output = NnxTestGenerator._cast( - output, conf.bias_type, saturate=False - ).type(torch.int32) - - if conf.has_relu: - output = F.relu(output) - if global_shift is None: - global_shift = NnxTestGenerator._global_shift( - output, conf.out_type, conf.has_relu + global_shift = torch.Tensor([0]).type(torch.int32) + output = NeuralEngineFunctionalModel().convolution( + input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__ ) - output = output >> global_shift + NnxTestGenerator._calculate_global_shift(output, conf.out_type) - # Saturate into out_type - output = NnxTestGenerator._cast(output, conf.out_type, saturate=True) + output = NeuralEngineFunctionalModel().convolution( + input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__ + ) return NnxTest( conf=conf, @@ -328,7 +274,7 @@ class NnxTestHeaderGenerator: def __init__( self, - weight_unroll: Callable[ + weightEncode: Callable[ [npt.NDArray[np.uint8], int, bool], npt.NDArray[np.uint8] ], headers_dir: Optional[Union[str, os.PathLike]] = None, @@ -338,7 +284,7 @@ def __init__( self.header_writer = HeaderWriter(headers_dir) # function that takes the weights in CoutCinK format, bitwidth, and a depthwise flag, # and returns a numpy array of dtype=np.uint8 of data in a layout correct for the accelerator - self.weight_unroll = weight_unroll + self.weightEncode = weightEncode def generate(self, test_name: str, test: NnxTest): assert test.input is not None and test.output is not None @@ -371,7 +317,7 @@ def generate(self, test_name: str, test: NnxTest): weight_offset = -(2 ** (weight_bits - 1)) weight_out_ch, weight_in_ch, weight_ks_h, weight_ks_w = test.weight.shape weight_data: np.ndarray = test.weight.numpy() - weight_offset - weight_init = self.weight_unroll( + weight_init = self.weightEncode( weight_data.astype(np.uint8), weight_type._bits, test.conf.depthwise, diff --git a/test/conftest.py b/test/conftest.py index 09a18c1..b812434 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -105,7 +105,7 @@ def pytest_generate_tests(metafunc): test = NnxTest.load(nnxTestConfCls, test_dir) # (Re)generate data if not test.is_valid() or regenerate: - test = NnxTestGenerator.from_conf(test.conf, nnxMemoryLayoutCls.ACCUMULATOR_TYPE) + test = NnxTestGenerator.from_conf(test.conf) 
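The global shift no longer needs to be chosen up front: NnxTestGenerator.from_conf runs the functional model once with global_shift = 0, derives the shift from the spread of the raw accumulators, and reruns the model with it. A minimal sketch of that derivation, assuming a flat int32 output buffer (the helper is illustrative; torch's .std() uses the unbiased n-1 estimator, population variance is used below for brevity):

#include <math.h>
#include <stdint.h>

/* Pick a right-shift so the unshifted outputs roughly span out_bits:
 * shift = ceil(log2(std(out) / 2^(out_bits - 1))),
 * mirroring NnxTestGenerator._calculate_global_shift. */
int32_t global_shift_sketch(const int32_t *out, int n, int out_bits) {
  double mean = 0.0, var = 0.0;
  for (int i = 0; i < n; i++) mean += out[i];
  mean /= n;
  for (int i = 0; i < n; i++)
    var += ((double)out[i] - mean) * ((double)out[i] - mean);
  const double s = sqrt(var / n);                   /* population std */
  const double target_s = ldexp(1.0, out_bits - 1); /* 2^(out_bits - 1) */
  return (int32_t)ceil(log2(s / target_s));
}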
test.save_data(test_dir) nnxTestAndNames.append((test, test_dir)) except pydantic.ValidationError as e: diff --git a/test/testgen.py b/test/testgen.py index 899f0a1..b8ccd79 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -43,9 +43,7 @@ def headers_gen( test = NnxTest.load(nnxTestConfCls, args.test_dir) assert test is not None if not test.is_valid(): - test = NnxTestGenerator.from_conf( - test.conf, nnxMemoryLayoutCls.ACCUMULATOR_TYPE - ) + test = NnxTestGenerator.from_conf(test.conf) NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( args.test_dir, test ) @@ -83,9 +81,7 @@ def test_gen( exit(-1) test_conf = nnxTestConfCls.model_validate(test_conf_dict) - test = NnxTestGenerator.from_conf( - test_conf, nnxMemoryLayoutCls.ACCUMULATOR_TYPE, verbose=args.print_tensors - ) + test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors) if not args.skip_save: test.save(args.test_dir) if args.headers: From b743930b3c9c22ea064fa0a256eaae75de829717 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 25 Jan 2024 15:22:12 +0100 Subject: [PATCH 42/72] Remove conf from NeuralEngineFunctionalModel --- test/NeuralEngineFunctionalModel.py | 56 +++++++++++++++++------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py index 5378fa4..a264115 100644 --- a/test/NeuralEngineFunctionalModel.py +++ b/test/NeuralEngineFunctionalModel.py @@ -1,8 +1,7 @@ from typing import Optional import torch import torch.nn.functional as F -from NnxTestClasses import NnxTestConf -from TestClasses import IntegerType +from TestClasses import IntegerType, Padding, Stride class NeuralEngineFunctionalModel: @@ -19,56 +18,69 @@ def _cast( def _norm_quant( self, - conf: NnxTestConf, tensor: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor], global_shift: torch.Tensor, + out_type: IntegerType, + bias_type: Optional[IntegerType], + has_bias: bool, + has_relu: bool, ) -> torch.Tensor: # Scale accumulators are in 48bit, so keeping the data in 64bit - tensor = scale * tensor + tensor = tensor * scale assert tensor.dtype == torch.int64 - if conf.has_bias: + if has_bias: assert bias is not None - assert conf.bias_type is not None + assert bias_type is not None # Saturating cast to int32 tensor = NeuralEngineFunctionalModel._cast( - tensor, conf.bias_type, saturate=True + tensor, bias_type, saturate=True ).type(torch.int32) tensor = tensor + bias tensor = NeuralEngineFunctionalModel._cast( - tensor, conf.bias_type, saturate=False + tensor, bias_type, saturate=False ).type(torch.int32) - if conf.has_relu: + if has_relu: tensor = F.relu(tensor) tensor = tensor >> global_shift # Saturate into out_type - tensor = NeuralEngineFunctionalModel._cast(tensor, conf.out_type, saturate=True) + tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True) return tensor def convolution( self, - conf: NnxTestConf, input: torch.Tensor, weight: torch.Tensor, - scale: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, - global_shift: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + global_shift: Optional[torch.Tensor], + padding: Padding, + stride: Stride, + depthwise: bool, + out_type: IntegerType, + bias_type: Optional[IntegerType], + has_norm_quant: bool, + has_bias: bool, + has_relu: bool, verbose: bool = False, + **kwargs, ) -> torch.Tensor: + _ = kwargs + input_padded = F.pad( input, ( - conf.padding.left, - conf.padding.right, - 
conf.padding.top, - conf.padding.bottom, + padding.left, + padding.right, + padding.top, + padding.bottom, ), "constant", 0, @@ -79,8 +91,8 @@ def convolution( output = F.conv2d( input=input_padded, weight=weight, - stride=(conf.stride.height, conf.stride.width), - groups=conf.in_channel if conf.depthwise else 1, + stride=(stride.height, stride.width), + groups=weight.shape[0] if depthwise else 1, ).type(torch.int64) # Cast to accumulator type @@ -92,9 +104,9 @@ def convolution( print("INTERMEDIATE RESULTS (pre-normalization/requant):") print(output) - if conf.has_norm_quant: + if has_norm_quant: assert scale is not None assert global_shift is not None - output = self._norm_quant(conf, output, scale, bias, global_shift) + output = self._norm_quant(output, scale, bias, global_shift, out_type, bias_type, has_bias, has_relu) return output From f425ea57f57278d0a24af076f0352aad4e10b64f Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 09:08:37 +0100 Subject: [PATCH 43/72] Add isort --- .gitlab-ci.yml | 11 +++++++++-- test/.isort.cfg | 4 ++++ test/Ne16TestConf.py | 9 ++++++--- test/NeuralEngineFunctionalModel.py | 13 ++++++++++++- test/NeurekaMemoryLayout.py | 1 + test/NeurekaTestConf.py | 9 ++++++--- test/NnxTestClasses.py | 22 +++++++++++++++------- test/TestClasses.py | 9 +++++---- test/conftest.py | 4 ++-- test/requirements-dev.txt | 1 + test/test.py | 7 ++++--- test/testgen.py | 6 ++++-- 12 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 test/.isort.cfg diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b8357a4..4c7b267 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,14 +20,21 @@ stages: - lint - test -format_python: +python_format: stage: lint tags: - python-lint script: - black --check . -static_check_python: +python_sort_imports: + stage: lint + tags: + - python-lint + script: + - isort --check test + +python_static_check: stage: lint tags: - python-lint diff --git a/test/.isort.cfg b/test/.isort.cfg new file mode 100644 index 0000000..127bf37 --- /dev/null +++ b/test/.isort.cfg @@ -0,0 +1,4 @@ +[settings] +profile=black +line_length=88 +skip_gitignore=true diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py index 2470421..1c2b3b6 100644 --- a/test/Ne16TestConf.py +++ b/test/Ne16TestConf.py @@ -17,11 +17,14 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import List, Union, Optional + +from typing import List, Optional, Union + +from pydantic import field_validator, model_validator + from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel from NnxTestClasses import NnxTestConf -from TestClasses import implies, KernelShape, Stride, IntegerType -from pydantic import field_validator, model_validator +from TestClasses import IntegerType, KernelShape, Stride, implies class Ne16TestConf(NnxTestConf): diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py index a264115..08b3601 100644 --- a/test/NeuralEngineFunctionalModel.py +++ b/test/NeuralEngineFunctionalModel.py @@ -1,6 +1,8 @@ from typing import Optional + import torch import torch.nn.functional as F + from TestClasses import IntegerType, Padding, Stride @@ -107,6 +109,15 @@ def convolution( if has_norm_quant: assert scale is not None assert global_shift is not None - output = self._norm_quant(output, scale, bias, global_shift, out_type, bias_type, has_bias, has_relu) + output = self._norm_quant( + output, + scale, + bias, + global_shift, + out_type, + bias_type, + has_bias, + has_relu, + ) return output 
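With the conf object gone, convolution and _norm_quant spell out the requantization pipeline explicitly: a 64-bit scale product (the scale accumulators are 48-bit), a saturating bias add in int32 followed by a wrapping cast, optional ReLU, the global shift, and saturation into the output type. The same per-pixel arithmetic as a minimal C sketch, assuming an int8 output type (helper names are illustrative, not library API):

#include <stdint.h>

static int64_t sat_i32(int64_t x) { /* saturating cast to int32 */
  return x < INT32_MIN ? INT32_MIN : (x > INT32_MAX ? INT32_MAX : x);
}

/* One output pixel of the norm/quant step. */
int8_t norm_quant_px_sketch(int32_t acc, int32_t scale, int32_t bias,
                            int has_bias, int has_relu, int32_t shift) {
  int64_t x = (int64_t)acc * (int64_t)scale;  /* 64-bit scale product */
  if (has_bias)
    x = (int32_t)(sat_i32(x) + bias); /* wrapping add, mirroring the model's
                                         non-saturating cast after the bias */
  if (has_relu && x < 0)
    x = 0;
  x >>= shift;                        /* global shift */
  x = x < INT8_MIN ? INT8_MIN : (x > INT8_MAX ? INT8_MAX : x);
  return (int8_t)x;                   /* saturate into out_type */
}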
diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py index 52d1f53..80a2786 100644 --- a/test/NeurekaMemoryLayout.py +++ b/test/NeurekaMemoryLayout.py @@ -19,6 +19,7 @@ import numpy as np import numpy.typing as npt + from TestClasses import IntegerType diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index 038e66b..d896a7e 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -17,11 +17,14 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import List, Union, Optional + +from typing import List, Optional, Union + +from pydantic import field_validator, model_validator + from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel from NnxTestClasses import NnxTestConf -from TestClasses import implies, KernelShape, Stride, IntegerType -from pydantic import field_validator, model_validator +from TestClasses import IntegerType, KernelShape, Stride, implies class NeurekaTestConf(NnxTestConf): diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index e829f31..5deb285 100644 --- a/test/NnxTestClasses.py +++ b/test/NnxTestClasses.py @@ -17,16 +17,18 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import Callable, Literal, Union, Optional, Set, Tuple, Type -import torch + +import os +from typing import Callable, Optional, Set, Tuple, Type, Union + import numpy as np import numpy.typing as npt -import torch.nn.functional as F -import os +import torch +from pydantic import BaseModel, PositiveInt, field_validator, model_validator + from HeaderWriter import HeaderWriter from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel -from TestClasses import IntegerType, Stride, Padding, KernelShape, implies, xor -from pydantic import BaseModel, PositiveInt, field_validator, model_validator +from TestClasses import IntegerType, KernelShape, Padding, Stride, implies, xor class NnxTestConf(BaseModel): @@ -243,7 +245,13 @@ def from_conf( if global_shift is None: global_shift = torch.Tensor([0]).type(torch.int32) output = NeuralEngineFunctionalModel().convolution( - input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__ + input, + weight, + scale, + bias, + global_shift, + verbose=verbose, + **conf.__dict__, ) NnxTestGenerator._calculate_global_shift(output, conf.out_type) diff --git a/test/TestClasses.py b/test/TestClasses.py index 450ba21..d518b59 100644 --- a/test/TestClasses.py +++ b/test/TestClasses.py @@ -16,15 +16,16 @@ # # SPDX-License-Identifier: Apache-2.0 -from functools import cached_property import re -from typing import Any, Dict, Literal, Optional, TYPE_CHECKING +from functools import cached_property +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional + from pydantic import ( BaseModel, - model_serializer, - model_validator, NonNegativeInt, PositiveInt, + model_serializer, + model_validator, ) diff --git a/test/conftest.py b/test/conftest.py index b812434..3c0a316 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -19,15 +19,15 @@ import os from typing import Union -import pytest import pydantic +import pytest + from Ne16MemoryLayout import Ne16MemoryLayout from Ne16TestConf import Ne16TestConf from NeurekaMemoryLayout import NeurekaMemoryLayout from NeurekaTestConf import NeurekaTestConf from NnxTestClasses import NnxTest, NnxTestGenerator - _SUPPORTED_ACCELERATORS = ["ne16", "neureka"] diff --git a/test/requirements-dev.txt b/test/requirements-dev.txt index fa0a75a..0956e5e 100644 --- 
a/test/requirements-dev.txt +++ b/test/requirements-dev.txt @@ -1,2 +1,3 @@ pyright black +isort diff --git a/test/test.py b/test/test.py index 3ee7ea1..1893cdf 100644 --- a/test/test.py +++ b/test/test.py @@ -16,15 +16,16 @@ # # SPDX-License-Identifier: Apache-2.0 +import locale import os import re -from typing import Dict, Union, Optional, Tuple, Type -import locale import subprocess +from pathlib import Path +from typing import Dict, Optional, Tuple, Type, Union + from Ne16MemoryLayout import Ne16MemoryLayout from NeurekaMemoryLayout import NeurekaMemoryLayout from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator -from pathlib import Path HORIZONTAL_LINE = "\n" + "-" * 100 + "\n" diff --git a/test/testgen.py b/test/testgen.py index b8ccd79..521aecc 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -16,11 +16,13 @@ # # SPDX-License-Identifier: Apache-2.0 -import os import argparse import json +import os +from typing import Optional, Set, Type, Union + import toml -from typing import Optional, Type, Union, Set + from Ne16MemoryLayout import Ne16MemoryLayout from Ne16TestConf import Ne16TestConf from NeurekaMemoryLayout import NeurekaMemoryLayout From 932847d8f6213ca87b8281d1110e238e79b87c48 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 14:35:36 +0100 Subject: [PATCH 44/72] Remove neureka siracusa clock gating --- neureka/bsp/neureka_siracusa_bsp.c | 13 ------------- neureka/bsp/neureka_siracusa_bsp.h | 14 -------------- 2 files changed, 27 deletions(-) diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c index 5021e3f..9437250 100644 --- a/neureka/bsp/neureka_siracusa_bsp.c +++ b/neureka/bsp/neureka_siracusa_bsp.c @@ -26,7 +26,6 @@ #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \ (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \ NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS) -#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800 #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 #define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff #define NEUREKA_SIRACUSA_MAX_STALL (8) @@ -36,16 +35,6 @@ #define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000) #define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000) -void neureka_siracusa_cg_enable() { - *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= - NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN; -} - -void neureka_siracusa_cg_disable() { - *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= - ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN; -} - // TODO: Check if needed for neureka void neureka_siracusa_hci_setpriority_neureka() { *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= @@ -71,7 +60,6 @@ void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) { } void neureka_siracusa_open(neureka_siracusa_conf_t *conf) { - neureka_siracusa_cg_enable(); neureka_siracusa_hci_setpriority_neureka(); neureka_siracusa_hci_set_max_stall(conf->max_stall); } @@ -79,7 +67,6 @@ void neureka_siracusa_open(neureka_siracusa_conf_t *conf) { void neureka_siracusa_close() { neureka_siracusa_hci_reset_max_stall(); neureka_siracusa_hci_setpriority_core(); - neureka_siracusa_cg_disable(); } void neureka_siracusa_event_wait_and_clear() { diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h index 9e879e8..be75a20 100644 --- a/neureka/bsp/neureka_siracusa_bsp.h +++ b/neureka/bsp/neureka_siracusa_bsp.h @@ -24,20 +24,6 @@ #include "neureka.h" #include -/** - * neureka_siracusa_cg_enable - * - * Enable 
clock gating of the neureka. - */ -void neureka_siracusa_cg_enable(); - -/** - * neureka_siracusa_cg_enable - * - * Disable clock gating of the neureka. - */ -void neureka_siracusa_cg_disable(); - /** * neureka_siracusa_setpriority_neureka * From 7ff3fcbefe0e7815e8ecaa77b74238059fe1460a Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 14:37:58 +0100 Subject: [PATCH 45/72] Remove inline from hal --- ne16/hal/ne16_task.c | 10 +++++----- neureka/hal/neureka_task.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 7e405f9..8472372 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -22,7 +22,7 @@ #include "ne16_task_defs.h" #include "pulp_nnx_util.h" -inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, +uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width) { uint32_t tile_padding = padding; @@ -84,14 +84,14 @@ void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, * it was the start to the padded data. * Necessary for input pointer when it's padded. */ -inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, +uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, const uint32_t channel, const uint8_t bits, const uint8_t padding_top, const uint8_t padding_left) { return ptr - (padding_top * width + padding_left) * channel * bits / 8; } -inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, +void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, uint32_t k_in, uint8_t bits_in, uint8_t padding_top, uint8_t padding_left, uint32_t output_ptr, uint32_t weights_ptr, @@ -170,7 +170,7 @@ void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, task->data.cfg.subtile = subtile; } -inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, +void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, const uint8_t bottom, const uint8_t left, const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | @@ -178,7 +178,7 @@ inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, (value & 0xff); } -inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, +void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 1b210ec..2c8823c 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -22,7 +22,7 @@ #include "neureka_task_defs.h" #include "pulp_nnx_util.h" -inline uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width) { uint32_t tile_padding = padding; @@ -83,14 +83,14 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, * it was the start to the padded data. * Necessary for input pointer when it's padded. 
*/ -inline uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, const uint32_t channel, const uint8_t bits, const uint8_t padding_top, const uint8_t padding_left) { return ptr - (padding_top * width + padding_left) * channel * bits / 8; } -inline void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t w_in, uint32_t k_in, uint8_t bits_in, uint8_t padding_top, uint8_t padding_left, uint32_t output_ptr, uint32_t weights_ptr, @@ -169,7 +169,7 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, task->data.cfg.subtile = subtile; } -inline void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, const uint8_t bottom, const uint8_t left, const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | @@ -177,7 +177,7 @@ inline void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, (value & 0xff); } -inline void neureka_task_set_mask_filter(neureka_task_t *task, +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left) { From 772fd951234d8f0d69454d8aaec726f018f8e3a6 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 14:50:18 +0100 Subject: [PATCH 46/72] Removed xor, python has xor --- test/NnxTestClasses.py | 4 ++-- test/TestClasses.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index 5deb285..5d45c44 100644 --- a/test/NnxTestClasses.py +++ b/test/NnxTestClasses.py @@ -28,7 +28,7 @@ from HeaderWriter import HeaderWriter from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel -from TestClasses import IntegerType, KernelShape, Padding, Stride, implies, xor +from TestClasses import IntegerType, KernelShape, Padding, Stride, implies class NnxTestConf(BaseModel): @@ -81,7 +81,7 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: @model_validator(mode="after") # type: ignore def check_valid_out_type_with_relu(self) -> NnxTestConf: - assert xor(self.has_relu, self.out_type._signed), ( + assert self.has_relu ^ self.out_type._signed, ( f"Output type has to be unsigned when there is relu, otherwise signed. 
" f"Given output type {self.out_type} and has_relu {self.has_relu}" ) diff --git a/test/TestClasses.py b/test/TestClasses.py index d518b59..c6267d6 100644 --- a/test/TestClasses.py +++ b/test/TestClasses.py @@ -33,10 +33,6 @@ def implies(a: bool, b: bool): return (not a) or b -def xor(a: bool, b: bool): - return (a and not b) or (not a and b) - - class KernelShape(BaseModel): height: PositiveInt width: PositiveInt From 2223af9d2592913a60b2e8e5389785988caebe52 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 17:23:57 +0100 Subject: [PATCH 47/72] WIP: Add without norm_quant --- ne16/hal/ne16_task.c | 66 +++++++++++++++++++++++----------------- ne16/hal/ne16_task.h | 16 +++++----- test/Ne16TestConf.py | 2 +- test/NnxTestClasses.py | 30 ++++++++++++------ test/app/src/nnx_layer.c | 61 ++++++++++++++++++++++++------------- 5 files changed, 108 insertions(+), 67 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 8472372..5ac443c 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -41,41 +41,52 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const ne16_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, ne16_quant_t quant, - ne16_norm_t norm, const uint8_t stride) { - const uint32_t flag_mode16 = - input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; - - *task = (ne16_task_t){ - .outbytes = output_bits / 8, - .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 - : NE16_WEIGHT_D0_STRIDE_MODE8, - .qw = weights_bits, - .stride_shift = stride == 2 ? 1 : 0, - .output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT - : NE16_OUTPUT_CHANNEL_THROUGHPUT, - .kernel_shape = kernel_shape, - .depthwise = depthwise, - .data = {0}}; +void ne16_task_init(ne16_task_t *task) { + *task = (ne16_task_t) { .data = {0} }; + task->data.cfg.conf0 |= quantMode32Bit; +} - const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0; +void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT + : NE16_OUTPUT_CHANNEL_THROUGHPUT; + task->stride_shift = stride == 2 ? 1 : 0; const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1 : depthwise == 1 ? NE16_FLAG_MODE_3x3_DW : NE16_FLAG_MODE_3x3; + const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0; + + task->data.cfg.conf0 |= flag_mode | flag_stride2x2; +} + +void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weight_bits) { + const uint32_t flag_mode16 = + input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; + + task->out_d0_stride = 256 / output_bits; + task->weight_d0_stride = flag_mode16 ? 
NE16_WEIGHT_D0_STRIDE_MODE16 + : NE16_WEIGHT_D0_STRIDE_MODE8; + task->qw = weight_bits; + task->data.cfg.conf0 |= flag_mode16 | (weight_bits - 1); +} + +void ne16_task_set_norm_quant(ne16_task_t *task, + ne16_quant_t quant, ne16_norm_t norm) { task->data.cfg.conf0 |= NE16_FLAG_NORM_QUANT | quant.function | quant.mode | (quant.shift_amount << 16) | quant.flag_rounding << NE16_SHIFT_ROUNDING | norm.mode | norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode | - flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2; + norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; +} - task->data.cfg.weight_offset_factor = weights_offset_factor; +void ne16_task_set_weight_offset(ne16_task_t *task, ne16_weight_offset_mode_e weight_offset_mode, const int32_t weight_offset) { + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; } /** ne16_pad_ptr @@ -119,10 +130,9 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, // WARNING: Stride works only for even output channel sizes (divisible by 2) const ne16_stride_t output_stride = { - .d0 = 32, - .d1 = (k_out_stride * task->outbytes) >> task->stride_shift, - .d2 = - (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift}; + .d0 = task->out_d0_stride, + .d1 = k_out_stride >> task->stride_shift, + .d2 = (k_out_stride * w_out_stride) >> task->stride_shift}; task->data.cfg.output_stride = output_stride; if (task->kernel_shape == 1) { diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index 0823b81..4bacac4 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -110,7 +110,7 @@ typedef struct ne16_task_data_t { typedef struct ne16_task_t { ne16_task_data_t data; - uint8_t outbytes; + uint8_t out_d0_stride; uint8_t weight_d0_stride; uint8_t qw; uint8_t stride_shift; @@ -120,12 +120,14 @@ typedef struct ne16_task_t { uint8_t id; } ne16_task_t; -void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const ne16_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, ne16_quant_t quant, - ne16_norm_t norm, const uint8_t stride); +void ne16_task_init(ne16_task_t *task); +void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weight_bits); +void ne16_task_set_norm_quant(ne16_task_t *task, + ne16_quant_t quant, ne16_norm_t norm); +void ne16_task_set_weight_offset(ne16_task_t *task, ne16_weight_offset_mode_e weight_offset_mode, const int32_t weight_offset); uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py index 1c2b3b6..efe75af 100644 --- a/test/Ne16TestConf.py +++ b/test/Ne16TestConf.py @@ -61,7 +61,7 @@ def check_valid_in_type(cls, v: IntegerType) -> IntegerType: @field_validator("out_type") @classmethod def check_valid_out_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("out_type", v, ["uint8", "int8"]) + Ne16TestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) return v @field_validator("weight_type") diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py index 5d45c44..90cfc71 100644 --- a/test/NnxTestClasses.py +++ 
b/test/NnxTestClasses.py @@ -65,12 +65,6 @@ def check_valid_padding_with_kernel_shape_1x1(self) -> NnxTestConf: ), f"No padding on 1x1 kernel. Given padding {self.padding}" return self - @field_validator("has_norm_quant") - @classmethod - def check_valid_has_norm_quant(cls, v: bool) -> bool: - assert v == True, f"Untested without has_norm_quant." - return v - @model_validator(mode="after") # type: ignore def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: if self.has_norm_quant: @@ -79,6 +73,22 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: assert self.bias_type is not None, "Bias type was not provided." return self + @model_validator(mode="after") # type: ignore + def check_has_relu_with_norm_quant(self) -> NnxTestConf: + assert implies(self.has_relu, self.has_norm_quant), ( + f"Relu flag can only be enabled when norm_quant is enabled. " + f"Given has_relu {self.has_relu} and has_norm_quant {self.has_norm_quant}" + ) + return self + + @model_validator(mode="after") # type: ignore + def check_has_bias_with_norm_quant(self) -> NnxTestConf: + assert implies(self.has_bias, self.has_norm_quant), ( + f"Bias flag can only be enabled when norm_quant is enabled. " + f"Given has_bias {self.has_bias} and has_norm_quant {self.has_norm_quant}" + ) + return self + @model_validator(mode="after") # type: ignore def check_valid_out_type_with_relu(self) -> NnxTestConf: assert self.has_relu ^ self.out_type._signed, ( @@ -365,13 +375,13 @@ def generate(self, test_name: str, test: NnxTest): "width": in_width, "channel": in_channel, "signed": in_signed, - "bits": 8, + "bits": test.conf.in_type._bits, }, "output": { "height": out_height, "width": out_width, "channel": out_channel, - "bits": 8, + "bits": test.conf.out_type._bits, }, "weight": { "height": weight_ks_h, @@ -381,8 +391,8 @@ def generate(self, test_name: str, test: NnxTest): "bits": weight_bits, "offset": weight_offset, }, - "scale": {"bits": 8}, - "bias": {"bits": 32}, + "scale": {"bits": test.conf.scale_type._bits if test.conf.scale_type is not None else 0}, + "bias": {"bits": test.conf.bias_type._bits if test.conf.bias_type is not None else 0}, "padding": { "top": test.conf.padding.top, "bottom": test.conf.padding.bottom, diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 003e55e..540797d 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -102,44 +102,63 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #include "weight.h" static void task_prepare(nnx_task_t *task) { - nnx_task_init( - task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS, - weightOffsetModeLayerWise, WEIGHT_OFFSET, + nnx_task_init(task); + ne16_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); + ne16_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + +#if HAS_NORM_QUANT == 1 +#if OUTPUT_BITS == 8 + const ne16_quant_mode_e quantMode = quantMode8Bit; +#elif OUTPUT_BITS == 32 + const ne16_quant_mode_e quantMode = quantMode32Bit; +#endif +#if SCALE_BITS == 8 + const ne16_norm_mode_e normMode = normMode8Bit; +#elif SCALE_BITS == 32 + const ne16_norm_mode_e normMode = normMode32Bit; +#endif + + ne16_task_set_norm_quant( + task, (nnx_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode8Bit, + .mode = quantMode, .function = HAS_RELU ? quantFunctionRelu : quantFunctionIdentity, .flag_rounding = nnxTaskFlagFalse}, - (nnx_norm_t){.mode = normMode8Bit, + (nnx_norm_t){.mode = normMode32Bit, .flag_bias = HAS_BIAS ? 
nnxTaskFlagTrue : nnxTaskFlagFalse, - .flag_shift = nnxTaskFlagFalse}, -#ifdef NNX_NE16 - STRIDE_HEIGHT -#elif NNX_NEUREKA - INPUT_SIGNED -#endif - ); + .flag_shift = nnxTaskFlagFalse}); +#endif // HAS_NORM_QUANT + ne16_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + + const uint32_t k_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t k_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; #if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 nnx_task_set_dims_stride2x2( - task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, - OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, - PADDING_RIGHT, PADDING_LEFT); + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, k_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, k_out_stride, + WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + PADDING_LEFT); #else - nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, - OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, PADDING_BOTTOM, - PADDING_RIGHT, PADDING_LEFT); + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, k_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, + k_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + PADDING_LEFT); #endif nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, - (uint32_t)weight, (uint32_t)scale, NULL, + (uint32_t)weight, +#if HAS_NORM_QUANT == 1 + (uint32_t)scale, NULL, #if HAS_BIAS == 1 (uint32_t)bias #else NULL +#endif +#else + NULL, NULL, NULL #endif ); } From 8232527ca95633abcee3c59f4eb648bbac257a06 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 20:57:50 +0100 Subject: [PATCH 48/72] Remove -flto --- test/app/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/app/Makefile b/test/app/Makefile index 75fc343..ca65892 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -63,7 +63,6 @@ APP_SRCS += $(wildcard gen/src/*.c) ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:]) APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE) -APP_CFLAGS += -O2 -w -Wall -Werror -flto -APP_LDFLAGS += -flto +APP_CFLAGS += -O2 -w -Wall -Werror include $(RULES_DIR)/pmsis_rules.mk From df3f5ddc9d1f503e7579194a696fd731a64bf1b6 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 20:58:40 +0100 Subject: [PATCH 49/72] Add -std=c11 --- test/app/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/app/Makefile b/test/app/Makefile index ca65892..d73353f 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -65,4 +65,8 @@ APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE APP_CFLAGS += -O2 -w -Wall -Werror +ifndef GAP_SDK_HOME +APP_CFLAGS += -std=c11 +endif + include $(RULES_DIR)/pmsis_rules.mk From 24ebd9e0d9cf83e20d01edc2bce12a73ef305d56 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 21:12:18 +0100 Subject: [PATCH 50/72] Move stride shift to stride2x2 function --- ne16/hal/ne16_task.c | 10 ++++------ ne16/hal/ne16_task.h | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 5ac443c..ae7954e 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -52,8 +52,6 @@ void 
ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, task->kernel_shape = kernel_shape; task->output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT : NE16_OUTPUT_CHANNEL_THROUGHPUT; - task->stride_shift = stride == 2 ? 1 : 0; - const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1 : depthwise == 1 ? NE16_FLAG_MODE_3x3_DW : NE16_FLAG_MODE_3x3; @@ -128,11 +126,10 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - // WARNING: Stride works only for even output channel sizes (divisible by 2) const ne16_stride_t output_stride = { .d0 = task->out_d0_stride, - .d1 = k_out_stride >> task->stride_shift, - .d2 = (k_out_stride * w_out_stride) >> task->stride_shift}; + .d1 = k_out_stride, + .d2 = k_out_stride * w_out_stride}; task->data.cfg.output_stride = output_stride; if (task->kernel_shape == 1) { @@ -222,8 +219,9 @@ void ne16_task_set_dims_stride2x2( const uint8_t padding_left) { const uint8_t stride = 2; + // WARNING: works only for even output channel stride (divisible by 2) ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + k_out_stride >> 1); ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index 4bacac4..606154e 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -113,7 +113,6 @@ typedef struct ne16_task_t { uint8_t out_d0_stride; uint8_t weight_d0_stride; uint8_t qw; - uint8_t stride_shift; uint8_t output_channel_throughput; uint8_t kernel_shape; uint8_t depthwise; From f1ed5f63c39514419fd2abf1142384e723f43e8a Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 21:55:06 +0100 Subject: [PATCH 51/72] Fix flag clear before setting --- ne16/hal/ne16_task.c | 71 +++++++++++++++++++++------------------ ne16/hal/ne16_task.h | 8 +++-- ne16/hal/ne16_task_defs.h | 28 ++++++++++----- 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index ae7954e..61aaf49 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -23,8 +23,8 @@ #include "pulp_nnx_util.h" uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width) { + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { uint32_t tile_padding = padding; if (i_height > 0) { tile_padding &= ~(0xf << 28); @@ -42,8 +42,8 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, } void ne16_task_init(ne16_task_t *task) { - *task = (ne16_task_t) { .data = {0} }; - task->data.cfg.conf0 |= quantMode32Bit; + *task = (ne16_task_t){.data = {0}}; + task->data.cfg.conf0 |= quantMode32Bit; } void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, @@ -51,13 +51,14 @@ void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, task->depthwise = depthwise; task->kernel_shape = kernel_shape; task->output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT - : NE16_OUTPUT_CHANNEL_THROUGHPUT; + : NE16_OUTPUT_CHANNEL_THROUGHPUT; const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1 : depthwise == 1 ? NE16_FLAG_MODE_3x3_DW : NE16_FLAG_MODE_3x3; const int flag_stride2x2 = stride == 2 ? 
NE16_FLAG_STRIDE_2x2 : 0; + task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE | NE16_MASK_FLAG_STRIDE_2x2); task->data.cfg.conf0 |= flag_mode | flag_stride2x2; } @@ -67,22 +68,31 @@ void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; task->out_d0_stride = 256 / output_bits; - task->weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 - : NE16_WEIGHT_D0_STRIDE_MODE8; + task->weight_d0_stride = + flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8; task->qw = weight_bits; + task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE16 | NE16_MASK_FLAG_WEIGHT_BITS); task->data.cfg.conf0 |= flag_mode16 | (weight_bits - 1); } -void ne16_task_set_norm_quant(ne16_task_t *task, - ne16_quant_t quant, ne16_norm_t norm) { - task->data.cfg.conf0 |= - NE16_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | quant.flag_rounding << NE16_SHIFT_ROUNDING | - norm.mode | norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; +void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, + ne16_norm_t norm) { + task->data.cfg.conf0 &= + ~(NE16_MASK_QUANT_MODE | NE16_MASK_QUANT_FUNCTION | + NE16_MASK_SHIFT_AMOUNT | NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE | + NE16_MASK_FLAG_NORM_BIAS | NE16_MASK_FLAG_NORM_SHIFT); + task->data.cfg.conf0 |= NE16_FLAG_NORM_QUANT | quant.function | quant.mode | + (quant.shift_amount << 16) | + quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | + norm.mode | + norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; } -void ne16_task_set_weight_offset(ne16_task_t *task, ne16_weight_offset_mode_e weight_offset_mode, const int32_t weight_offset) { +void ne16_task_set_weight_offset(ne16_task_t *task, + ne16_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NE16_MASK_WEIGHT_OFFSET_MODE; task->data.cfg.conf0 |= weight_offset_mode; task->data.cfg.weight_offset_factor = weight_offset; } @@ -94,18 +104,16 @@ void ne16_task_set_weight_offset(ne16_task_t *task, ne16_weight_offset_mode_e we * Necessary for input pointer when it's padded. 
*/ uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, const uint8_t padding_left) { return ptr - (padding_top * width + padding_left) * channel * bits / 8; } -void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { +void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, + uint32_t k_in, uint8_t bits_in, uint8_t padding_top, + uint8_t padding_left, uint32_t output_ptr, + uint32_t weights_ptr, uint32_t scale_ptr, + uint32_t shift_ptr, uint32_t bias_ptr) { task->data.infeat_ptr = ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); task->data.outfeat_ptr = output_ptr; @@ -126,10 +134,9 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - const ne16_stride_t output_stride = { - .d0 = task->out_d0_stride, - .d1 = k_out_stride, - .d2 = k_out_stride * w_out_stride}; + const ne16_stride_t output_stride = {.d0 = task->out_d0_stride, + .d1 = k_out_stride, + .d2 = k_out_stride * w_out_stride}; task->data.cfg.output_stride = output_stride; if (task->kernel_shape == 1) { @@ -178,16 +185,16 @@ void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, } void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value) { + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | (value & 0xff); } void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, - const uint8_t right, const uint8_t bottom, - const uint8_t left) { + const uint8_t right, const uint8_t bottom, + const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index 606154e..ae27623 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -124,9 +124,11 @@ void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, const uint8_t depthwise, const uint8_t stride); void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, const uint8_t output_bits, const uint8_t weight_bits); -void ne16_task_set_norm_quant(ne16_task_t *task, - ne16_quant_t quant, ne16_norm_t norm); -void ne16_task_set_weight_offset(ne16_task_t *task, ne16_weight_offset_mode_e weight_offset_mode, const int32_t weight_offset); +void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, + ne16_norm_t norm); +void ne16_task_set_weight_offset(ne16_task_t *task, + ne16_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h index 803e30e..df5cd4c 100644 --- a/ne16/hal/ne16_task_defs.h +++ b/ne16/hal/ne16_task_defs.h @@ -59,12 +59,6 @@ #define NE16_REG_FILTER_MASKING 22 #define 
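The pointer arithmetic in ne16_pad_ptr backs the input pointer up to where the (virtual) padded top-left corner would sit in memory. A worked example with made-up numbers: for an 8-bit NHWC input with width 32, 16 channels, and 1 pixel of top and left padding,

  (1 * 32 + 1) * 16 * 8 / 8 = 528,

so the returned pointer lands 528 bytes before the first real pixel:

uint32_t padded = ne16_pad_ptr(input_ptr, /*width=*/32, /*channel=*/16,
                               /*bits=*/8, /*padding_top=*/1,
                               /*padding_left=*/1);
/* padded == input_ptr - 528 */
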
NE16_REG_CONF0 23 -/* SHIFT */ - -#define NE16_SHIFT_FLAG_NORM_BIAS (25) -#define NE16_SHIFT_FLAG_NORM_SHIFT (24) -#define NE16_SHIFT_ROUNDING (11) - /* CONF0 FLAGS */ #define NE16_FLAG_NORM_BIAS (1 << 25) @@ -81,7 +75,7 @@ #define NE16_NORM_MODE_8BIT (0 << 12) #define NE16_NORM_MODE_16BIT (1 << 12) #define NE16_NORM_MODE_32BIT (2 << 12) -#define NE16_FLAG_ROUND (1 << 11) +#define NE16_FLAG_ROUNDING (1 << 11) #define NE16_FLAG_STRIDE_2x2 (1 << 8) #define NE16_FLAG_LINEAR_MODE (1 << 7) #define NE16_FLAG_MODE_3x3 (0 << 5) @@ -91,10 +85,26 @@ #define NE16_FLAG_MODE_BASIC (0 << 3) #define NE16_FLAG_MODE16 (1 << 3) +/* SHIFT */ + +#define NE16_SHIFT_FLAG_NORM_BIAS (25) +#define NE16_SHIFT_FLAG_NORM_SHIFT (24) +#define NE16_SHIFT_FLAG_ROUNDING (11) + /* Masks */ -#define NE16_MASK_QUANT_FUNCTION (1 << 23) -#define NE16_MASK_QUANT_MODE (3 << 21) +#define NE16_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NE16_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NE16_MASK_QUANT_FUNCTION (0x1 << 23) +#define NE16_MASK_QUANT_MODE (0x3 << 21) +#define NE16_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NE16_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NE16_MASK_NORM_MODE (0x3 << 12) +#define NE16_MASK_FLAG_ROUNDING (0x1 << 11) +#define NE16_MASK_FLAG_STRIDE_2x2 (0x1 << 8) +#define NE16_MASK_FLAG_MODE (0x3 << 5) +#define NE16_MASK_FLAG_MODE16 (0x1 << 3) +#define NE16_MASK_FLAG_WEIGHT_BITS (0x7 << 0) /* PADDING */ From c47b2c549ac22fbc46ee5d8f7ae8302a3dac2b52 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 21:55:45 +0100 Subject: [PATCH 52/72] Fix normMode hardcoded to 32bit --- test/app/src/nnx_layer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 540797d..b12ccba 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -125,10 +125,11 @@ static void task_prepare(nnx_task_t *task) { .function = HAS_RELU ? quantFunctionRelu : quantFunctionIdentity, .flag_rounding = nnxTaskFlagFalse}, - (nnx_norm_t){.mode = normMode32Bit, + (nnx_norm_t){.mode = normMode, .flag_bias = HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse, .flag_shift = nnxTaskFlagFalse}); #endif // HAS_NORM_QUANT + // ne16_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); const uint32_t k_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; From 27ab3a53cacebc33f481204df52bb5798676e7d5 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 26 Jan 2024 22:21:34 +0100 Subject: [PATCH 53/72] Set quantMode in *_set_bits function --- ne16/hal/ne16_task.c | 34 ++++++++++++++++++++-------------- ne16/hal/ne16_task.h | 1 - test/app/src/nnx_layer.c | 6 ------ 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 61aaf49..98897fc 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -41,10 +41,7 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void ne16_task_init(ne16_task_t *task) { - *task = (ne16_task_t){.data = {0}}; - task->data.cfg.conf0 |= quantMode32Bit; -} +void ne16_task_init(ne16_task_t *task) { *task = (ne16_task_t){.data = {0}}; } void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, const uint8_t depthwise, const uint8_t stride) { @@ -67,26 +64,35 @@ void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, const uint32_t flag_mode16 = input_bits == 16 ? 
NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; + ne16_quant_mode_e quantMode; + if (output_bits == 16) { + quantMode = quantMode16Bit; + } else if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + task->out_d0_stride = 256 / output_bits; task->weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8; task->qw = weight_bits; - task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE16 | NE16_MASK_FLAG_WEIGHT_BITS); - task->data.cfg.conf0 |= flag_mode16 | (weight_bits - 1); + task->data.cfg.conf0 &= ~(NE16_MASK_QUANT_MODE | NE16_MASK_FLAG_MODE16 | + NE16_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | flag_mode16 | (weight_bits - 1); } void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, ne16_norm_t norm) { task->data.cfg.conf0 &= - ~(NE16_MASK_QUANT_MODE | NE16_MASK_QUANT_FUNCTION | - NE16_MASK_SHIFT_AMOUNT | NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE | + ~(NE16_MASK_QUANT_FUNCTION | NE16_MASK_SHIFT_AMOUNT | + NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE | NE16_MASK_FLAG_NORM_BIAS | NE16_MASK_FLAG_NORM_SHIFT); - task->data.cfg.conf0 |= NE16_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | - norm.mode | - norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; + task->data.cfg.conf0 |= + NE16_FLAG_NORM_QUANT | quant.function | (quant.shift_amount << 16) | + quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | norm.mode | + norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; } void ne16_task_set_weight_offset(ne16_task_t *task, diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index ae27623..6044133 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -60,7 +60,6 @@ typedef enum ne16_quant_function_e { typedef struct ne16_quant_t { // Shift amount must be in range 0x00-0x1F unsigned shift_amount; - ne16_quant_mode_e mode; ne16_quant_function_e function; int flag_rounding; } ne16_quant_t; diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index b12ccba..82b6b4c 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -107,11 +107,6 @@ static void task_prepare(nnx_task_t *task) { ne16_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); #if HAS_NORM_QUANT == 1 -#if OUTPUT_BITS == 8 - const ne16_quant_mode_e quantMode = quantMode8Bit; -#elif OUTPUT_BITS == 32 - const ne16_quant_mode_e quantMode = quantMode32Bit; -#endif #if SCALE_BITS == 8 const ne16_norm_mode_e normMode = normMode8Bit; #elif SCALE_BITS == 32 @@ -121,7 +116,6 @@ static void task_prepare(nnx_task_t *task) { ne16_task_set_norm_quant( task, (nnx_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode, .function = HAS_RELU ? 
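With the quantization mode folded into ne16_task_set_bits, the caller configures all element widths in one place and the mode follows from output_bits (16 selects quantMode16Bit, 8 selects quantMode8Bit, anything else falls back to quantMode32Bit). A hedged usage sketch:

ne16_task_t task;
ne16_task_init(&task);
/* 8-bit activations, 32-bit accumulator outputs, 8-bit weights;
 * this call now also selects quantMode32Bit internally. */
ne16_task_set_bits(&task, /*input_bits=*/8, /*output_bits=*/32,
                   /*weight_bits=*/8);
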
quantFunctionRelu : quantFunctionIdentity, .flag_rounding = nnxTaskFlagFalse}, From 23009ccdbd4f0175d17efea0b63d2fcd424db1c4 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sat, 27 Jan 2024 07:00:02 +0100 Subject: [PATCH 54/72] Fix output d0 stride and rename defs --- ne16/hal/ne16_task.c | 27 +++++++++++++-------------- ne16/hal/ne16_task.h | 3 +-- ne16/hal/ne16_task_defs.h | 9 +++++++-- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 98897fc..629f264 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -47,8 +47,8 @@ void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, const uint8_t depthwise, const uint8_t stride) { task->depthwise = depthwise; task->kernel_shape = kernel_shape; - task->output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT - : NE16_OUTPUT_CHANNEL_THROUGHPUT; + task->subtile_output_channel = + depthwise ? NE16_SUBTILE_INPUT_CHANNEL : NE16_SUBTILE_OUTPUT_CHANNEL; const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1 : depthwise == 1 ? NE16_FLAG_MODE_3x3_DW : NE16_FLAG_MODE_3x3; @@ -73,7 +73,6 @@ void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, quantMode = quantMode32Bit; } - task->out_d0_stride = 256 / output_bits; task->weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8; task->qw = weight_bits; @@ -134,13 +133,13 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, const uint32_t k_in_stride, const uint32_t w_out_stride, const uint32_t k_out_stride) { - const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); + const uint32_t num_k_in = divnceil(k_in, NE16_SUBTILE_INPUT_CHANNEL); const ne16_stride_t input_stride = { .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - const ne16_stride_t output_stride = {.d0 = task->out_d0_stride, + const ne16_stride_t output_stride = {.d0 = NE16_OUTPUT_BANDWIDTH_BYTES, .d1 = k_out_stride, .d2 = k_out_stride * w_out_stride}; task->data.cfg.output_stride = output_stride; @@ -167,15 +166,15 @@ void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, const uint8_t padding_right) { - const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); - const uint16_t num_Ki = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); - const uint16_t num_Ho = divnceil(h_out, NE16_FILTER_SIZE); - const uint16_t num_Wo = divnceil(w_out, NE16_FILTER_SIZE); - - const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput); - const uint16_t rem_Ki = remainder(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); - const uint16_t rem_Ho = remainder(h_out, NE16_FILTER_SIZE); - const uint16_t rem_Wo = remainder(w_out, NE16_FILTER_SIZE); + const uint16_t num_Ko = divnceil(k_out, task->subtile_output_channel); + const uint16_t num_Ki = divnceil(k_in, NE16_SUBTILE_INPUT_CHANNEL); + const uint16_t num_Ho = divnceil(h_out, NE16_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = divnceil(w_out, NE16_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = remainder(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = remainder(k_in, NE16_SUBTILE_INPUT_CHANNEL); + const uint16_t rem_Ho = remainder(h_out, NE16_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = remainder(w_out, NE16_SUBTILE_OUTPUT_WIDTH); const uint16_t rem_Hi = (task->kernel_shape == 1 ? 
rem_Ho : rem_Ho + 2) - padding_bottom; const uint16_t rem_Wi = diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index 6044133..557e2a0 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -109,10 +109,9 @@ typedef struct ne16_task_data_t { typedef struct ne16_task_t { ne16_task_data_t data; - uint8_t out_d0_stride; uint8_t weight_d0_stride; uint8_t qw; - uint8_t output_channel_throughput; + uint8_t subtile_output_channel; uint8_t kernel_shape; uint8_t depthwise; uint8_t id; diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h index df5cd4c..d3d7297 100644 --- a/ne16/hal/ne16_task_defs.h +++ b/ne16/hal/ne16_task_defs.h @@ -25,8 +25,13 @@ #define NE16_FILTER_SIZE (3) #define NE16_FILTER_BUFFER_SIZE (5) -#define NE16_INPUT_CHANNEL_THROUGHPUT (16) -#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32) +#define NE16_SUBTILE_INPUT_HEIGHT (5) +#define NE16_SUBTILE_INPUT_WIDTH (5) +#define NE16_SUBTILE_INPUT_CHANNEL (16) +#define NE16_SUBTILE_OUTPUT_HEIGHT (3) +#define NE16_SUBTILE_OUTPUT_WIDTH (3) +#define NE16_SUBTILE_OUTPUT_CHANNEL (32) +#define NE16_OUTPUT_BANDWIDTH_BYTES (32) #define NE16_WEIGHT_D0_STRIDE_MODE8 (2) #define NE16_WEIGHT_D0_STRIDE_MODE16 (1) From e78dd80edeaa96f6306fd319139899ce18c80397 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sat, 27 Jan 2024 10:06:58 +0100 Subject: [PATCH 55/72] Fixes to strides and stride2x2 Fixed stride2x2 mode for 32bit output. Changed stride meaning from number of elements in that dimension, to number of bytes between elements in that dimension. This also required a change of dimension names. --- inc/pulp_nnx_ne16.h | 11 +++++----- ne16/hal/ne16_task.c | 32 ++++++++++++++-------------- ne16/hal/ne16_task.h | 36 ++++++++++++++++++++++++-------- src/pulp_nnx_ne16.c | 39 +++++++++++++++-------------------- test/app/src/nnx_layer.c | 23 +++++++++++---------- test/tests/test_106/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_107/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_108/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_109/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_110/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_111/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_112/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_113/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_114/conf.json | 29 ++++++++++++++++++++++++++ test/tests/test_115/conf.json | 29 ++++++++++++++++++++++++++ 15 files changed, 367 insertions(+), 64 deletions(-) create mode 100644 test/tests/test_106/conf.json create mode 100644 test/tests/test_107/conf.json create mode 100644 test/tests/test_108/conf.json create mode 100644 test/tests/test_109/conf.json create mode 100644 test/tests/test_110/conf.json create mode 100644 test/tests/test_111/conf.json create mode 100644 test/tests/test_112/conf.json create mode 100644 test/tests/test_113/conf.json create mode 100644 test/tests/test_114/conf.json create mode 100644 test/tests/test_115/conf.json diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h index 7bbda6d..97e6e2e 100644 --- a/inc/pulp_nnx_ne16.h +++ b/inc/pulp_nnx_ne16.h @@ -69,9 +69,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); * tile the tile to the subtile's spatial dimensions (in this case 3x3 output). * Works only if the k_out is divisible by 2. 
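The "divisible by 2" restriction follows from how the strided mode is driven: the output strides handed to the task are halved (note the >> 1 in ne16_task_set_dims_stride2x2 in the hunk below), so the byte stride k_out * output_bits / 8 must be even for the halved value to stay exact; a later patch in this series adjusts the test-side validity check to exactly this byte-stride condition. For example, with 8-bit outputs:

/* Sketch, values illustrative: */
const uint32_t k_out = 34;                   /* even channel count */
const uint32_t w_out_stride = k_out * 8 / 8; /* 34 bytes per pixel */
const uint32_t halved = w_out_stride >> 1;   /* 17, no truncation  */
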
*/ -void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker); +void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task, + const uint32_t w_in, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t h_ker, + const uint8_t w_ker); diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 629f264..21518a7 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -129,19 +129,19 @@ void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, } void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride) { + const uint32_t h_out_stride, + const uint32_t w_out_stride) { const uint32_t num_k_in = divnceil(k_in, NE16_SUBTILE_INPUT_CHANNEL); const ne16_stride_t input_stride = { - .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, .d2 = 0}; + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; const ne16_stride_t output_stride = {.d0 = NE16_OUTPUT_BANDWIDTH_BYTES, - .d1 = k_out_stride, - .d2 = k_out_stride * w_out_stride}; + .d1 = w_out_stride, + .d2 = h_out_stride}; task->data.cfg.output_stride = output_stride; if (task->kernel_shape == 1) { @@ -205,16 +205,16 @@ void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, } void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t k_in, const uint32_t h_in_stride, + const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, - const uint32_t k_out_stride, const uint8_t padding_top, + const uint32_t h_out_stride, + const uint32_t w_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, padding_right); ne16_task_set_padding(task, padding_top, padding_bottom, padding_left, @@ -223,17 +223,17 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, void ne16_task_set_dims_stride2x2( ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { const uint8_t stride = 2; // WARNING: works only for even output channel stride (divisible by 2) - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride >> 1); + ne16_task_set_strides(task, 
k_in, h_in_stride, w_in_stride, h_out_stride >> 1, + w_out_stride >> 1); ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index 557e2a0..dd12c39 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -138,11 +138,17 @@ void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, uint8_t padding_left, uint32_t output_ptr, uint32_t weights_ptr, uint32_t scale_ptr, uint32_t shift_ptr, uint32_t bias_ptr); +/** ne16_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); + const uint32_t h_out_stride, + const uint32_t w_out_stride); void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, @@ -153,20 +159,32 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left); +/** ne16_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t k_in, const uint32_t h_in_stride, + const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, - const uint32_t k_out_stride, const uint8_t padding_top, + const uint32_t h_out_stride, + const uint32_t w_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); +/** ne16_task_set_dims_stride2x2 + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. 
+ */ void ne16_task_set_dims_stride2x2( ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c index 6417b07..99a2c9c 100644 --- a/src/pulp_nnx_ne16.c +++ b/src/pulp_nnx_ne16.c @@ -79,22 +79,17 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, uint32_t size_j, uint32_t size_k, uint32_t stride_j, uint32_t stride_k, uint32_t overlap_i, uint32_t overlap_j, - uint32_t offset_i, uint32_t offset_j, - uint8_t data_size) { - return ptr + - (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k * - data_size / 8 + - (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; + uint32_t offset_i, uint32_t offset_j) { + return ptr + (i * (size_i - overlap_i) - offset_i) * stride_j + + (j * (size_j - overlap_j) - offset_j) * stride_k; } -void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker) { +void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task, + const uint32_t w_in, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t h_ker, + const uint8_t w_ker) { const uint8_t stride = 2; - const uint8_t bits = 8; const uint32_t n_h = divnceil(h_out, stride); const uint32_t n_w = divnceil(w_out, stride); @@ -109,15 +104,15 @@ void ne16_nnx_dispatch_stride2x2( for (int i = 0; i < n_h; i++) { for (int j = 0; j < n_w; j++) { - task->data.infeat_ptr = - _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, - w_in_stride, k_in_stride, h_ker - stride, - w_ker - stride, i == 0 ? 0 : input_height_offset, - j == 0 ? 0 : input_width_offset, bits); - task->data.outfeat_ptr = - _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride, - k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset, - j == 0 ? 0 : output_width_offset, bits); + task->data.infeat_ptr = _get_tile_ptr( + input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, + task->data.cfg.input_stride.d1, task->data.cfg.input_stride.d0, + h_ker - stride, w_ker - stride, i == 0 ? 0 : input_height_offset, + j == 0 ? 0 : input_width_offset); + task->data.outfeat_ptr = _get_tile_ptr( + output_base, i, j, 2, 2, k_out, task->data.cfg.output_stride.d2 << 1, + task->data.cfg.output_stride.d1 << 1, 0, 0, + i == 0 ? 0 : output_height_offset, j == 0 ? 
0 : output_width_offset); task->data.cfg.padding = ne16_get_tile_padding(tile_padding, i, j, n_h, n_w); diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 82b6b4c..893f2fc 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -126,19 +126,21 @@ static void task_prepare(nnx_task_t *task) { // ne16_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); - const uint32_t k_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; - const uint32_t k_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride; + const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride; #if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 nnx_task_set_dims_stride2x2( - task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, k_in_stride, - OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, k_out_stride, + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); #else - nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, k_in_stride, - OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, - k_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, + w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); #endif @@ -169,10 +171,9 @@ static void task_execute(nnx_task_t *task) { nnx_dispatch_wait(dev); #if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 - nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, - WEIGHT_HEIGHT, WEIGHT_WIDTH); + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, + OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, + WEIGHT_WIDTH); #else nnx_dispatch(dev, task); #endif diff --git a/test/tests/test_106/conf.json b/test/tests/test_106/conf.json new file mode 100644 index 0000000..0b98f3a --- /dev/null +++ b/test/tests/test_106/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 17, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_107/conf.json b/test/tests/test_107/conf.json new file mode 100644 index 0000000..2f8951c --- /dev/null +++ b/test/tests/test_107/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 17, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of 
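After this change, every stride argument is a distance in bytes between consecutive elements along its dimension; with the NHWC layout the channels stay contiguous, so no channel stride is needed. For the test layer the strides reduce to (mirroring nnx_layer.c above):

const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; /* pixel to pixel */
const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride;      /* row to row     */

Previously the same slots carried element counts, which is why the parameter names change along with the semantics.
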
file diff --git a/test/tests/test_108/conf.json b/test/tests/test_108/conf.json new file mode 100644 index 0000000..7842aaa --- /dev/null +++ b/test/tests/test_108/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_109/conf.json b/test/tests/test_109/conf.json new file mode 100644 index 0000000..a6b71c9 --- /dev/null +++ b/test/tests/test_109/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_110/conf.json b/test/tests/test_110/conf.json new file mode 100644 index 0000000..622efc4 --- /dev/null +++ b/test/tests/test_110/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_111/conf.json b/test/tests/test_111/conf.json new file mode 100644 index 0000000..d6714c4 --- /dev/null +++ b/test/tests/test_111/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_112/conf.json b/test/tests/test_112/conf.json new file mode 100644 index 0000000..1991c59 --- /dev/null +++ b/test/tests/test_112/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 1, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_113/conf.json b/test/tests/test_113/conf.json new file mode 100644 index 0000000..1dce097 --- /dev/null +++ 
b/test/tests/test_113/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 1 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_114/conf.json b/test/tests/test_114/conf.json new file mode 100644 index 0000000..c1ce5c3 --- /dev/null +++ b/test/tests/test_114/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 1, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_115/conf.json b/test/tests/test_115/conf.json new file mode 100644 index 0000000..19153ba --- /dev/null +++ b/test/tests/test_115/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 1, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file From a6f142a9fbe4b0c65628d1f984c9dabe64dc3d5c Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sat, 27 Jan 2024 15:14:19 +0100 Subject: [PATCH 56/72] Add no norm_quant to neureka and all the fixes too --- neureka/hal/neureka_task.c | 178 +++++++++++++++++++------------- neureka/hal/neureka_task.h | 58 +++++++---- neureka/hal/neureka_task_defs.h | 48 +++++---- test/NeurekaTestConf.py | 2 +- test/app/src/nnx_layer.c | 37 +++++-- 5 files changed, 206 insertions(+), 117 deletions(-) diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 2c8823c..35d0745 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -23,8 +23,8 @@ #include "pulp_nnx_util.h" uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width) { + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { uint32_t tile_padding = padding; if (i_height > 0) { tile_padding &= ~(0xf << 28); @@ -41,40 +41,78 @@ uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const neureka_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, - neureka_quant_t quant, neureka_norm_t norm, - const uint8_t flag_input_signed) { - *task = (neureka_task_t){.outbytes = output_bits / 8, - .qw = weights_bits, - .output_channel_throughput = - depthwise ? 
NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT, - .input_channel_throughput = - kernel_shape == 3 - ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 - : NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1, - .kernel_shape = kernel_shape, - .depthwise = depthwise, - .data = {0}}; +void neureka_task_init(neureka_task_t *task) { + *task = (neureka_task_t){.data = {0}}; +} + +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, + const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_OUTPUT_CHANNEL; + task->subtile_input_channel = kernel_shape == 3 + ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1; const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 : depthwise == 1 ? NEUREKA_FLAG_MODE_3x3_DW : NEUREKA_FLAG_MODE_3x3; - task->data.cfg.conf0 |= - flag_input_signed << NEUREKA_SHIFT_FLAG_INPUT_SIGNED | - NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | norm.mode | - norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | NEUREKA_FLAG_USE_TCDM | - weights_offset_mode | flag_mode | (weights_bits - 1); + task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE); + task->data.cfg.conf0 |= flag_mode; +} + +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits) { + neureka_quant_mode_e quantMode; + if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->qw = weight_bits; + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | (weight_bits - 1); +} + +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm) { + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT | + NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS | + NEUREKA_MASK_FLAG_NORM_SHIFT); + task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function | + (quant.shift_amount << 16) | norm.mode | + norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT; +} + +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; +} + +void neureka_task_set_input_signed(neureka_task_t *task) { + task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_input_unsigned(neureka_task_t *task) { + task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED; +} - task->data.cfg.weight_offset_factor = weights_offset_factor; +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE; + task->data.cfg.conf0 |= weight_source; } /** neureka_pad_ptr @@ -84,18 +122,18 @@ void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape, * Necessary for input pointer when it's padded. 
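Taken together, the setters above replace the old monolithic neureka_task_init signature with one call per concern. A hedged end-to-end sketch of the new flow (parameter values are illustrative only):

neureka_task_t task;
neureka_task_init(&task);
neureka_task_set_op_to_conv(&task, /*kernel_shape=*/3, /*depthwise=*/0,
                            /*stride=*/1);
neureka_task_set_bits(&task, /*input_bits=*/8, /*output_bits=*/8,
                      /*weight_bits=*/8);
neureka_task_set_weight_offset(&task, weightOffsetModeLayerWise, -128);
neureka_task_set_input_unsigned(&task);
neureka_task_set_weight_source(&task, neurekaWeightSourceWmem);
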
*/ uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, + const uint8_t padding_left) { return ptr - (padding_top * width + padding_left) * channel * bits / 8; } void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { + uint32_t w_in, uint32_t k_in, uint8_t bits_in, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr) { task->data.infeat_ptr = neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); task->data.outfeat_ptr = output_ptr; @@ -106,31 +144,28 @@ void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, } void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride) { - const uint32_t num_k_in = divnceil(k_in, task->input_channel_throughput); + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = divnceil(k_in, task->subtile_input_channel); const neureka_stride_t input_stride = { - .d0 = k_in_stride, - .d1 = k_in_stride * w_in_stride, - .d2 = 0 // Unused - }; + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - const neureka_stride_t output_stride = { - .d0 = 32, // TODO: should depend on outbytes. 
Probably 32 / outbytes - .d1 = k_out_stride * task->outbytes, - .d2 = k_out_stride * task->outbytes * w_out_stride}; + const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; task->data.cfg.output_stride = output_stride; - task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_D0_STRIDE; + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES; if (task->kernel_shape == 1) { // 1x1 - task->data.cfg.weights_stride.d1 = NEUREKA_WEIGHT_D0_STRIDE * num_k_in; + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in; } else if (!task->depthwise) { // 3x3 task->data.cfg.weights_stride.d1 = - NEUREKA_WEIGHT_D0_STRIDE * task->qw * num_k_in; + NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in; } else { // 3x3 depthwise task->data.cfg.weights_stride.d1 = 0; } @@ -142,15 +177,15 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint32_t k_out, const uint8_t padding_bottom, const uint8_t padding_right) { - const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); - const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput); - const uint16_t num_Ho = divnceil(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT); - const uint16_t num_Wo = divnceil(w_out, NEUREKA_COMPUTE_SIZE_WIDTH); - - const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput); - const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput); - const uint16_t rem_Ho = remainder(h_out, NEUREKA_COMPUTE_SIZE_HEIGHT); - const uint16_t rem_Wo = remainder(w_out, NEUREKA_COMPUTE_SIZE_WIDTH); + const uint16_t num_Ko = divnceil(k_out, task->subtile_output_channel); + const uint16_t num_Ki = divnceil(k_in, task->subtile_input_channel); + const uint16_t num_Ho = divnceil(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = divnceil(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = remainder(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = remainder(k_in, task->subtile_input_channel); + const uint16_t rem_Ho = remainder(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = remainder(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); const uint16_t rem_Hi = rem_Ho == 0 ? 0 : (task->kernel_shape == 1 ? 
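A quick worked example of the subtile bookkeeping above, using the helpers from pulp_nnx_util.h: with h_out = 20 and 6-pixel output subtiles (NEUREKA_SUBTILE_OUTPUT_HEIGHT),

const uint16_t num_Ho = divnceil(20, 6);  /* 4 subtiles                 */
const uint16_t rem_Ho = remainder(20, 6); /* last subtile is 2 rows tall */

so the engine iterates four height subtiles, the last of which is truncated to 2 rows.
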
rem_Ho : rem_Ho + 2) - @@ -170,30 +205,29 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, } void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value) { + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | (value & 0xff); } -void neureka_task_set_mask_filter(neureka_task_t *task, - const uint8_t top, const uint8_t right, - const uint8_t bottom, - const uint8_t left) { +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } void neureka_task_set_dims( neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { - neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, padding_right); neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h index 64356e6..a265223 100644 --- a/neureka/hal/neureka_task.h +++ b/neureka/hal/neureka_task.h @@ -29,6 +29,11 @@ typedef enum neureka_task_flag_e { neurekaTaskFlagTrue = 1 } neureka_task_flag_e; +typedef enum neureka_weight_source_e { + neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM, + neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM +} neureka_weight_source_e; + typedef enum neureka_weight_offset_mode_e { weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE @@ -36,7 +41,6 @@ typedef enum neureka_weight_offset_mode_e { typedef enum { normMode8Bit = NEUREKA_NORM_MODE_8BIT, - normMode16Bit = NEUREKA_NORM_MODE_16BIT, normMode32Bit = NEUREKA_NORM_MODE_32BIT } neureka_norm_mode_e; @@ -48,7 +52,6 @@ typedef struct neureka_norm_t { typedef enum neureka_quant_mode_e { quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, - quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, quantMode32Bit = NEUREKA_QUANT_MODE_32BIT } neureka_quant_mode_e; @@ -60,7 +63,6 @@ typedef enum neureka_quant_function_e { typedef struct neureka_quant_t { // Shift amount must be in range 0x00-0x1F unsigned shift_amount; - neureka_quant_mode_e mode; neureka_quant_function_e function; int flag_rounding; } neureka_quant_t; @@ -110,22 +112,30 @@ typedef struct neureka_task_data_t { typedef struct neureka_task_t { neureka_task_data_t data; - uint8_t outbytes; uint8_t qw; - uint8_t output_channel_throughput; - uint8_t input_channel_throughput; + uint8_t subtile_output_channel; + uint8_t subtile_input_channel; uint8_t kernel_shape; uint8_t depthwise; uint8_t id; } neureka_task_t; -void neureka_task_init(neureka_task_t *task, 
const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const neureka_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, - neureka_quant_t quant, neureka_norm_t norm, - const uint8_t flag_input_signed); +void neureka_task_init(neureka_task_t *task); +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits); +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm); +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); +void neureka_task_set_input_signed(neureka_task_t *task); +void neureka_task_set_input_unsigned(neureka_task_t *task); +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source); uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); @@ -138,11 +148,17 @@ void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t output_ptr, uint32_t weights_ptr, uint32_t scale_ptr, uint32_t shift_ptr, uint32_t bias_ptr); +/** neureka_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); + const uint32_t h_out_stride, + const uint32_t w_out_stride); void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, @@ -154,11 +170,17 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left); +/** neureka_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. 
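For reference, the weight strides set by neureka_task_set_strides earlier in this patch work out per mode as follows, each "beat" being NEUREKA_WEIGHT_BANDWIDTH_BYTES = 32 bytes:

/* 1x1:           d1 = 32 * num_k_in       one beat per input-channel subtile
 * 3x3:           d1 = 32 * qw * num_k_in  qw bit-planes per subtile
 * 3x3 depthwise: d1 = 0                   a single subtile, so no d1 stride */
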
+ */ void neureka_task_set_dims( neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index 7ed77eb..fa08289 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -23,13 +23,20 @@ /* ARHITECTURE */ -#define NEUREKA_COMPUTE_SIZE_HEIGHT (6) -#define NEUREKA_COMPUTE_SIZE_WIDTH (6) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_1x1 (32) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) -#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_WEIGHT_BANDWIDTH (256) -#define NEUREKA_WEIGHT_D0_STRIDE (NEUREKA_WEIGHT_BANDWIDTH / 8) +#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32) + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28) + +#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6) +#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6) +#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32) + +#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32) /* TASK REGISTERS */ @@ -65,7 +72,6 @@ #define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) #define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) #define NEUREKA_SHIFT_QUANT_SHIFT (16) -#define NEUREKA_SHIFT_ROUNDING (11) /* CONF0 FLAGS */ @@ -75,7 +81,6 @@ #define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) #define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) #define NEUREKA_QUANT_MODE_8BIT (0 << 21) -#define NEUREKA_QUANT_MODE_16BIT (1 << 21) // not supported #define NEUREKA_QUANT_MODE_32BIT (2 << 21) // conf0[20:16] - quantization shift amount #define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc @@ -83,25 +88,30 @@ (1 << 15) // Unimplemented in gvsoc #define NEUREKA_FLAG_STREAMIN (1 << 14) #define NEUREKA_NORM_MODE_8BIT (0 << 12) -#define NEUREKA_NORM_MODE_16BIT (1 << 12) // not supported #define NEUREKA_NORM_MODE_32BIT (2 << 12) -#define NEUREKA_FLAG_ROUND (1 << 11) // not supported #define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) -#define NEUREKA_FLAG_USE_WMEM (1 << 9) -#define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // not supported -#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not supported +#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9) +#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9) +#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested #define NEUREKA_FLAG_MODE_3x3 (0 << 5) #define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) #define NEUREKA_FLAG_MODE_1x1 (2 << 5) #define NEUREKA_FLAG_NORM_QUANT (1 << 4) -#define NEUREKA_FLAG_MODE_BASIC (0 << 3) -#define NEUREKA_FLAG_MODE16 (1 << 3) // not supported /* Masks */ -#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23) -#define NEUREKA_MASK_QUANT_MODE (3 << 21) +#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26) +#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23) +#define NEUREKA_MASK_QUANT_MODE (0x3 << 21) +#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16) +#define 
NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NEUREKA_MASK_NORM_MODE (0x3 << 12) +#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10) +#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9) +#define NEUREKA_MASK_FLAG_MODE (0x3 << 5) +#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0) /* PADDING */ diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py index d896a7e..f878e68 100644 --- a/test/NeurekaTestConf.py +++ b/test/NeurekaTestConf.py @@ -59,7 +59,7 @@ def check_valid_in_type(cls, v: IntegerType) -> IntegerType: @field_validator("out_type") @classmethod def check_valid_out_type(cls, v: IntegerType) -> IntegerType: - NeurekaTestConf._check_type("out_type", v, ["uint8", "int8"]) + NeurekaTestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) return v @field_validator("weight_type") diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index 893f2fc..41317f6 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -29,6 +29,7 @@ #include "ne16_task.h" #include "pulp_nnx_ne16.h" +typedef ne16_norm_mode_e nnx_norm_mode_e; typedef ne16_quant_t nnx_quant_t; typedef ne16_norm_t nnx_norm_t; typedef ne16_task_t nnx_task_t; @@ -39,6 +40,10 @@ typedef ne16_pulp_conf_t nnx_bsp_conf_t; #define nnxTaskFlagFalse ne16TaskFlagFalse #define nnx_task_init ne16_task_init +#define nnx_task_set_op_to_conv ne16_task_set_op_to_conv +#define nnx_task_set_bits ne16_task_set_bits +#define nnx_task_set_norm_quant ne16_task_set_norm_quant +#define nnx_task_set_weight_offset ne16_task_set_weight_offset #define nnx_task_set_dims ne16_task_set_dims #define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 #define nnx_task_set_ptrs ne16_task_set_ptrs @@ -65,6 +70,7 @@ typedef ne16_pulp_conf_t nnx_bsp_conf_t; #include "neureka_task.h" #include "pulp_nnx_neureka.h" +typedef neureka_norm_mode_e nnx_norm_mode_e; typedef neureka_quant_t nnx_quant_t; typedef neureka_norm_t nnx_norm_t; typedef neureka_task_t nnx_task_t; @@ -75,6 +81,10 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; #define nnxTaskFlagFalse neurekaTaskFlagFalse #define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset #define nnx_task_set_dims neureka_task_set_dims #define nnx_task_set_ptrs neureka_task_set_ptrs @@ -103,17 +113,17 @@ typedef neureka_siracusa_conf_t nnx_bsp_conf_t; static void task_prepare(nnx_task_t *task) { nnx_task_init(task); - ne16_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); - ne16_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); + nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); #if HAS_NORM_QUANT == 1 #if SCALE_BITS == 8 - const ne16_norm_mode_e normMode = normMode8Bit; + const nnx_norm_mode_e normMode = normMode8Bit; #elif SCALE_BITS == 32 - const ne16_norm_mode_e normMode = normMode32Bit; + const nnx_norm_mode_e normMode = normMode32Bit; #endif - ne16_task_set_norm_quant( + nnx_task_set_norm_quant( task, (nnx_quant_t){.shift_amount = OUTSHIFT, .function = @@ -123,8 +133,21 @@ static void task_prepare(nnx_task_t *task) { .flag_bias = HAS_BIAS ? 
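The test application reaches both accelerators through a thin alias layer, so task_prepare stays accelerator-agnostic. A sketch of the pattern (the guard macro NNX_NEUREKA appears in the hunk below; the exact else-branch guard is assumed):

#ifdef NNX_NEUREKA
typedef neureka_task_t nnx_task_t;
#define nnx_task_set_bits neureka_task_set_bits
#else /* NE16 */
typedef ne16_task_t nnx_task_t;
#define nnx_task_set_bits ne16_task_set_bits
#endif
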
                   .flag_shift = nnxTaskFlagFalse});
 #endif // HAS_NORM_QUANT
 
-  //
-  ne16_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET);
+
+  nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET);
+
+#ifdef NNX_NEUREKA
+#ifdef NEUREKA_WEIGHT_SOURCE_WMEM
+  neureka_task_set_weight_source(task, neurekaWeightSourceWmem);
+#else
+  neureka_task_set_weight_source(task, neurekaWeightSourceTcdm);
+#endif
+#if INPUT_SIGNED == 1
+  neureka_task_set_input_signed(task);
+#else
+  neureka_task_set_input_unsigned(task);
+#endif
+#endif
 
   const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8;
   const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride;

From c436ea4915eac96f1f44c6af2f2732eb5b11333d Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sat, 27 Jan 2024 15:15:39 +0100
Subject: [PATCH 57/72] Fix stride2x2 validity check to verify evenness of the
 output channel stride

---
 test/Ne16TestConf.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py
index efe75af..f2e66ad 100644
--- a/test/Ne16TestConf.py
+++ b/test/Ne16TestConf.py
@@ -85,9 +85,10 @@ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType
         return v
 
     @model_validator(mode="after")  # type: ignore
-    def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf:
+    def check_valid_out_channel_stride_with_stride_2x2(self) -> Ne16TestConf:
         assert implies(
-            self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0
+            self.stride == Stride(height=2, width=2),
+            self.out_channel * (self.out_type._bits // 8) % 2 == 0,
         ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}"
         return self
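The validator above boils down to a byte-stride evenness condition: with 2x2 striding, the output channel size times the output element width in bytes must be even. A minimal C sketch of the same check — the helper name `out_channel_stride_is_even` is hypothetical, introduced here only for illustration:

```c
#include <stdbool.h>
#include <stdint.h>

// Mirrors the Python validator: out_channel * (out_bits / 8) must be even
// for a task to be dispatchable in the 2x2 strided mode.
static bool out_channel_stride_is_even(uint32_t out_channel, uint32_t out_bits) {
  const uint32_t k_out_stride_bytes = out_channel * (out_bits / 8);
  return k_out_stride_bytes % 2 == 0;
}

// Example: 7 int32 channels -> 28 B stride, even, accepted;
//          7 int8 channels  ->  7 B stride, odd, rejected.
```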
From eda8b5197568e1cf9edff2656e91fe2e3ef8d1bf Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sat, 27 Jan 2024 15:16:24 +0100
Subject: [PATCH 58/72] Fix formatting

---
 test/NnxTestClasses.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py
index 90cfc71..a7aaa00 100644
--- a/test/NnxTestClasses.py
+++ b/test/NnxTestClasses.py
@@ -391,8 +391,16 @@ def generate(self, test_name: str, test: NnxTest):
                 "bits": weight_bits,
                 "offset": weight_offset,
             },
-            "scale": {"bits": test.conf.scale_type._bits if test.conf.scale_type is not None else 0},
-            "bias": {"bits": test.conf.bias_type._bits if test.conf.bias_type is not None else 0},
+            "scale": {
+                "bits": test.conf.scale_type._bits
+                if test.conf.scale_type is not None
+                else 0
+            },
+            "bias": {
+                "bits": test.conf.bias_type._bits
+                if test.conf.bias_type is not None
+                else 0
+            },
             "padding": {
                 "top": test.conf.padding.top,
                 "bottom": test.conf.padding.bottom,

From 8b37485b2596ca44ed40436a1392b35f235a487d Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sat, 27 Jan 2024 15:30:13 +0100
Subject: [PATCH 59/72] Remove TODOs because neureka clearly needs these
 functions

---
 neureka/bsp/neureka_siracusa_bsp.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c
index 9437250..57136fd 100644
--- a/neureka/bsp/neureka_siracusa_bsp.c
+++ b/neureka/bsp/neureka_siracusa_bsp.c
@@ -35,25 +35,21 @@
 #define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000)
 #define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000)
 
-// TODO: Check if needed for neureka
 void neureka_siracusa_hci_setpriority_neureka() {
   *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
       NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
 }
 
-// TODO: Check if needed for neureka
 void neureka_siracusa_hci_setpriority_core() {
   *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
       ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
 }
 
-// TODO: Check if needed for neureka
 void neureka_siracusa_hci_reset_max_stall() {
   *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
       ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
}
 
-// TODO: Check if needed for neureka
 void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) {
   *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
       max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
}

From d9c45efe27e0918e620221a9b0839f8e5eb81d89 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sat, 27 Jan 2024 15:47:20 +0100
Subject: [PATCH 60/72] Remove the equal-0 check because the remainder
 function can never return 0

---
 neureka/hal/neureka_task.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index 35d0745..d712aee 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -186,14 +186,10 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
   const uint16_t rem_Ki = remainder(k_in, task->subtile_input_channel);
   const uint16_t rem_Ho = remainder(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
   const uint16_t rem_Wo = remainder(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
-  const uint16_t rem_Hi =
-      rem_Ho == 0 ? 0
-                  : (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) -
-                        padding_bottom; // TODO: Check padding bottom
-  const uint16_t rem_Wi =
-      rem_Wo == 0 ? 0
-                  : (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) -
-                        padding_right; // TODO: Check padding right
+  const uint16_t rem_Hi = (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) -
+                          padding_bottom; // TODO: Check padding bottom
+  const uint16_t rem_Wi = (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) -
+                          padding_right; // TODO: Check padding right
 
   const neureka_subtile_t subtile = {
       .number = {.KoKi = concat_half(num_Ko, num_Ki),

From 29ee4838519153783fdeeeb8020c77024414ca3b Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sat, 27 Jan 2024 15:59:25 +0100
Subject: [PATCH 61/72] Remove TODOs, checked padding

---
 neureka/hal/neureka_task.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index d712aee..3527ac0 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -186,10 +186,10 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
   const uint16_t rem_Ki = remainder(k_in, task->subtile_input_channel);
   const uint16_t rem_Ho = remainder(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
   const uint16_t rem_Wo = remainder(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
-  const uint16_t rem_Hi = (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) -
-                          padding_bottom; // TODO: Check padding bottom
-  const uint16_t rem_Wi = (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) -
-                          padding_right; // TODO: Check padding right
+  const uint16_t rem_Hi =
+      (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
+  const uint16_t rem_Wi =
+      (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
 
   const neureka_subtile_t subtile = {
       .number = {.KoKi = concat_half(num_Ko, num_Ki),

From d9c7723a7a77485ad9f918b1f22d6ede2020c0c1 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sat, 27 Jan 2024 16:28:58 +0100
Subject: [PATCH 62/72] Rename divnceil and remainder, and add nnx_ prefix

---
 ne16/hal/ne16_task.c       | 39 +++++++++++++++++++++++---------------
 neureka/hal/neureka_task.c | 39 +++++++++++++++++++++++---------------
 src/pulp_nnx_ne16.c        |  4 ++--
 util/pulp_nnx_util.c       | 12 +++++++-----
 util/pulp_nnx_util.h       | 20 ++++++++++---------
 5 files changed, 68 insertions(+), 46 deletions(-)

diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c
index 21518a7..5f856e4 100644
--- a/ne16/hal/ne16_task.c
+++ b/ne16/hal/ne16_task.c
@@ -133,7 +133,8 @@ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
                            const uint32_t w_in_stride,
                            const uint32_t h_out_stride,
                            const uint32_t w_out_stride) {
-  const uint32_t num_k_in = divnceil(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+  const uint32_t num_k_in =
+      nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL);
 
   const ne16_stride_t input_stride = {
       .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0};
@@ -166,26 +167,34 @@ void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
                             const uint32_t h_out, const uint32_t w_out,
                             const uint32_t k_out, const uint8_t padding_bottom,
                             const uint8_t padding_right) {
-  const uint16_t num_Ko = divnceil(k_out, task->subtile_output_channel);
-  const uint16_t num_Ki = divnceil(k_in, NE16_SUBTILE_INPUT_CHANNEL);
-  const uint16_t num_Ho = divnceil(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
-  const uint16_t num_Wo = divnceil(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
-
-  const uint16_t rem_Ko = remainder(k_out, task->subtile_output_channel);
-  const uint16_t rem_Ki = remainder(k_in, NE16_SUBTILE_INPUT_CHANNEL);
-  const uint16_t rem_Ho = remainder(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
-  const uint16_t rem_Wo = remainder(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
+  const uint16_t num_Ko =
+      nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel);
+  const uint16_t num_Ki =
+      nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+  const uint16_t num_Ho =
+      nnx_calculate_number_of_tiles(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
+  const uint16_t num_Wo =
+      nnx_calculate_number_of_tiles(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
+
+  const uint16_t rem_Ko =
+      nnx_calculate_last_tile_size(k_out, task->subtile_output_channel);
+  const uint16_t rem_Ki =
+      nnx_calculate_last_tile_size(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+  const uint16_t rem_Ho =
+      nnx_calculate_last_tile_size(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
+  const uint16_t rem_Wo =
+      nnx_calculate_last_tile_size(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
   const uint16_t rem_Hi =
       (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
   const uint16_t rem_Wi =
       (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
 
   const ne16_subtile_t subtile = {
-      .number = {.KoKi = concat_half(num_Ko, num_Ki),
-                 .HoWo = concat_half(num_Ho, num_Wo)},
-      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
-                    .HoWo = concat_half(rem_Ho, rem_Wo),
-                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
+      .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki),
+                 .HoWo = nnx_concat_half(num_Ho, num_Wo)},
+      .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki),
+                    .HoWo = nnx_concat_half(rem_Ho, rem_Wo),
+                    .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}};
   task->data.cfg.subtile = subtile;
 }
 
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index 3527ac0..4541f9d 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -148,7 +148,8 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
                               const uint32_t w_in_stride,
                               const uint32_t h_out_stride,
                               const uint32_t w_out_stride) {
-  const uint32_t num_k_in = divnceil(k_in, task->subtile_input_channel);
+  const uint32_t num_k_in =
+      nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel);
 
   const neureka_stride_t input_stride = {
       .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0};
@@ -177,26 +178,34 @@ void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
                                const uint32_t k_out,
                                const uint8_t padding_bottom,
                                const uint8_t padding_right) {
-  const uint16_t num_Ko = divnceil(k_out, task->subtile_output_channel);
-  const uint16_t num_Ki = divnceil(k_in, task->subtile_input_channel);
-  const uint16_t num_Ho = divnceil(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
-  const uint16_t num_Wo = divnceil(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
-
-  const uint16_t rem_Ko = remainder(k_out, task->subtile_output_channel);
-  const uint16_t rem_Ki = remainder(k_in, task->subtile_input_channel);
-  const uint16_t rem_Ho = remainder(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
-  const uint16_t rem_Wo = remainder(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
+  const uint16_t num_Ko =
+      nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel);
+  const uint16_t num_Ki =
+      nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel);
+  const uint16_t num_Ho =
+      nnx_calculate_number_of_tiles(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
+  const uint16_t num_Wo =
+      nnx_calculate_number_of_tiles(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
+
+  const uint16_t rem_Ko =
+      nnx_calculate_last_tile_size(k_out, task->subtile_output_channel);
+  const uint16_t rem_Ki =
+      nnx_calculate_last_tile_size(k_in, task->subtile_input_channel);
+  const uint16_t rem_Ho =
+      nnx_calculate_last_tile_size(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
+  const uint16_t rem_Wo =
+      nnx_calculate_last_tile_size(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
   const uint16_t rem_Hi =
       (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
   const uint16_t rem_Wi =
       (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
 
   const neureka_subtile_t subtile = {
-      .number = {.KoKi = concat_half(num_Ko, num_Ki),
-                 .HoWo = concat_half(num_Ho, num_Wo)},
-      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
-                    .HoWo = concat_half(rem_Ho, rem_Wo),
-                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
+      .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki),
+                 .HoWo = nnx_concat_half(num_Ho, num_Wo)},
+      .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki),
+                    .HoWo = nnx_concat_half(rem_Ho, rem_Wo),
+                    .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}};
   task->data.cfg.subtile = subtile;
 }
 
diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c
index 99a2c9c..f9799fc 100644
--- a/src/pulp_nnx_ne16.c
+++ b/src/pulp_nnx_ne16.c
@@ -91,8 +91,8 @@ void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task,
                                  const uint8_t w_ker) {
   const uint8_t stride = 2;
 
-  const uint32_t n_h = divnceil(h_out, stride);
-  const uint32_t n_w = divnceil(w_out, stride);
+  const uint32_t n_h = nnx_calculate_number_of_tiles(h_out, stride);
+  const uint32_t n_w = nnx_calculate_number_of_tiles(w_out, stride);
   const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0;
   const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0;
   const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0;
diff --git a/util/pulp_nnx_util.c b/util/pulp_nnx_util.c
index 34db512..0107fc1 100644
--- a/util/pulp_nnx_util.c
+++ b/util/pulp_nnx_util.c
@@ -20,14 +20,16 @@
 
 #include "pulp_nnx_util.h"
 
-inline int divnceil(const int dividend, const int divisor) {
-  return ((dividend - 1) / divisor) + 1;
+inline int nnx_calculate_number_of_tiles(const int dim_size,
+                                         const int tile_size) {
+  return ((dim_size - 1) / tile_size) + 1;
 }
 
-inline int remainder(const int dividend, const int divisor) {
-  return ((dividend - 1) % divisor) + 1;
+inline int nnx_calculate_last_tile_size(const int dim_size,
+                                        const int tile_size) {
+  return ((dim_size - 1) % tile_size) + 1;
 }
 
-inline uint32_t concat_half(const uint16_t high, const uint16_t low) {
+inline uint32_t nnx_concat_half(const uint16_t high, const uint16_t low) {
   return ((uint32_t)high << 16) | low;
 }
diff --git a/util/pulp_nnx_util.h b/util/pulp_nnx_util.h
index 638e5d9..d167f6d 100644
--- a/util/pulp_nnx_util.h
+++ b/util/pulp_nnx_util.h
@@ -24,26 +24,28 @@
 #include <stdint.h>
 
 /**
- * divnceil
+ * nnx_calculate_number_of_tiles
  *
- * Does integer division and ceiling of it.
+ * Calculates the number of tiles needed to cover a dimension.
+ * It does this by dividing the dimension by the tile size and taking the
+ * ceiling of the result.
  */
-int divnceil(const int dividend, const int divisor);
+int nnx_calculate_number_of_tiles(const int dim_size, const int tile_size);
 
 /**
- * remainder
+ * nnx_calculate_last_tile_size
  *
- * Calculates the remainder but if the remainder should be 0,
- * returns divisor. Used for calculation of the last `remainding`
- * iteration of the tile.
+ * Calculates the size of the last executed tile as the remainder of
+ * dim_size divided by tile_size. In case the remainder is 0, it returns the
+ * full tile_size.
  */
-int remainder(const int dividend, const int divisor);
+int nnx_calculate_last_tile_size(const int dim_size, const int tile_size);
 
 /**
- * concat_half
+ * nnx_concat_half
  *
  * Concatenate 2 16-bit numbers into a 32-bit number.
  */
-uint32_t concat_half(const uint16_t high, const uint16_t low);
+uint32_t nnx_concat_half(const uint16_t high, const uint16_t low);
 
 #endif // __NNX_UTIL_H__
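To make the semantics of the renamed helpers concrete, a small standalone usage sketch (a hypothetical test, not part of the patch):

```c
#include <assert.h>
#include "pulp_nnx_util.h"

int main(void) {
  // 100 elements in tiles of 32: ceil(100 / 32) = 4 tiles...
  assert(nnx_calculate_number_of_tiles(100, 32) == 4);
  // ...and the last tile covers 100 - 3 * 32 = 4 elements.
  assert(nnx_calculate_last_tile_size(100, 32) == 4);
  // An evenly divisible dimension yields a full last tile instead of 0.
  assert(nnx_calculate_last_tile_size(64, 32) == 32);
  // Two 16-bit halves packed into one 32-bit register value.
  assert(nnx_concat_half(0x0004, 0x0002) == 0x00040002);
  return 0;
}
```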
From 37ba86c8549e0a2eacb1e6353591f64e771a24c7 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 11:25:03 +0100
Subject: [PATCH 63/72] Add citation

---
 README.md | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0daa8fd..8ef8b92 100644
--- a/README.md
+++ b/README.md
@@ -55,9 +55,30 @@ All the development should be done through forks and merged onto the `dev` branc
 
 The library will follow the [Semantic Versioning](https://semver.org/).
 
-## Citing
+## Publication
 
-*TBA*
+
+If you use PULP-NNX in your work, you can cite us:
+```
+@inproceedings{10.1145/3607889.3609092,
+  author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco},
+  title = {WIP: Automatic DNN Deployment on Heterogeneous Platforms: the GAP9 Case Study},
+  year = {2024},
+  isbn = {9798400702907},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/3607889.3609092},
+  doi = {10.1145/3607889.3609092},
+  abstract = {Emerging Artificial-Intelligence-enabled System-on-Chips (AI-SoCs) combine a flexible microcontroller with parallel Digital Signal Processors (DSP) and heterogeneous acceleration capabilities. In this Work-in-Progress paper, we focus on the GAP9 RISC-V SoC as a case study to show how the open-source DORY Deep Neural Network (DNN) tool flow can be extended for heterogeneous acceleration by fine grained interleaving of a dedicated Neural Engine and a cluster of RISC-V cores. Our results show that up to 91\% of the peak accelerator throughput can be extracted in end-to-end execution of benchmarks based on MobileNet-V1 and V2.},
+  booktitle = {Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems},
+  pages = {9–10},
+  numpages = {2},
+  keywords = {TinyML, MCUs, deep learning, HW accelerators},
+  location = {Hamburg, Germany},
+  series = {CASES '23 Companion}
+}
+```
+
 ## Contributors
 
From 9e0b211029122220e7f8ee54c19b1d50384d228d Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 15:25:30 +0100
Subject: [PATCH 64/72] Add SDK and compiler commit hashes

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 8ef8b92..e64ade1 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,15 @@ _Note: The accelerator can provide additional helper functions if needed._
 
 You can find information about testing in the dedicated [README](test/README.md).
 
+### Environment
+
+The library was tested with the following pairs of SDKs and compilers:
+
+| SDK | SDK Commit Hash | Compiler | Compiler Commit Hash |
+| --- | --------------- | -------- | -------------------- |
+| gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 |
+| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | 38e2754c4a | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) |
+
 ## Contributing
 
 Bug reports and feature requests should be reported through issues.

From 1a4f873327ea3b7271e0a313a34ce3661bad5d21 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 15:29:22 +0100
Subject: [PATCH 65/72] Change task size to a define

---
 ne16/hal/ne16.c       | 2 --
 ne16/hal/ne16.h       | 3 ++-
 neureka/hal/neureka.c | 2 --
 neureka/hal/neureka.h | 3 ++-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c
index 97859b4..d92a7d5 100644
--- a/ne16/hal/ne16.c
+++ b/ne16/hal/ne16.c
@@ -23,8 +23,6 @@
 #define NE16_STATUS_EMPTY (0x000)
 #define NE16_STATUS_FULL (0x101)
 
-inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; }
-
 inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) {
   uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
   return (status & 0x1) + ((status >> 8) & 0x1);
diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h
index c4c3a19..88ebee7 100644
--- a/ne16/hal/ne16.h
+++ b/ne16/hal/ne16.h
@@ -24,11 +24,12 @@
 #include "hwpe.h"
 #include <stdint.h>
 
+#define NE16_TASK_QUEUE_SIZE (2)
+
 typedef struct ne16_dev_t {
   hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
 } ne16_dev_t;
 
-int ne16_task_queue_size(ne16_dev_t *dev);
 int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev);
 int ne16_task_queue_empty(ne16_dev_t *dev);
 int ne16_task_queue_full(ne16_dev_t *dev);
diff --git a/neureka/hal/neureka.c b/neureka/hal/neureka.c
index ebcad93..dc829d9 100644
--- a/neureka/hal/neureka.c
+++ b/neureka/hal/neureka.c
@@ -23,8 +23,6 @@
 #define NEUREKA_STATUS_EMPTY (0x000)
 #define NEUREKA_STATUS_FULL (0x101)
 
-inline int neureka_task_queue_size(neureka_dev_t *dev) { return 2; }
-
 inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) {
   uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
   return (status & 0x1) + ((status >> 8) & 0x1);
diff --git a/neureka/hal/neureka.h b/neureka/hal/neureka.h
index 887d995..eae77a1 100644
--- a/neureka/hal/neureka.h
+++ b/neureka/hal/neureka.h
@@ -24,11 +24,12 @@
 #include "hwpe.h"
 #include <stdint.h>
 
+#define NEUREKA_TASK_QUEUE_SIZE (2)
+
 typedef struct neureka_dev_t {
   hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
 } neureka_dev_t;
 
-int neureka_task_queue_size(neureka_dev_t *dev);
 int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev);
 int neureka_task_queue_empty(neureka_dev_t *dev);
 int neureka_task_queue_full(neureka_dev_t *dev);

From 04127591715fc29fa8d7670d320e4c7fafe1babe Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 15:33:55 +0100
Subject: [PATCH 66/72] Update accelerators' supported features

---
 ne16/README.md    | 5 ++---
 neureka/README.md | 9 ++++-----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/ne16/README.md b/ne16/README.md
index 2876b0a..9f05956 100644
--- a/ne16/README.md
+++ b/ne16/README.md
@@ -12,7 +12,7 @@
 - [x] Stride 2x2
 - [ ] Normalization and quantization
   - [x] With
-  - [ ] Without
+  - [x] Without
   - [x] Relu (w/ and w/o)
   - [x] Bias (w/ and w/o)
   - [ ] Per-channel shift
@@ -24,8 +24,7 @@
 - [ ] Output type
   - [x] int8
   - [x] uint8 (only w/ Relu)
-  - [ ] int32
-  - [ ] uint32 (only w/ Relu)
+  - [x] int32
 - [ ] Scale type
   - [x] uint8
   - [ ] uint16
diff --git a/neureka/README.md b/neureka/README.md
index bca128a..9c83f4e 100644
--- a/neureka/README.md
+++ b/neureka/README.md
@@ -11,20 +11,19 @@ Github repo [link](https://github.com/siracusa-soc/ne).
 - [x] Depthwise convolution w/ kernel shape 3x3
 - [ ] Normalization and quantization
   - [x] With
-  - [ ] Without
+  - [x] Without
   - [x] Relu (w/ and w/o)
   - [x] Bias (w/ and w/o)
   - [ ] Per-channel shift
   - [x] Per-layer shift
   - [ ] Rounding
-- [ ] Input type
+- [x] Input type
   - [x] uint8
   - [x] int8
-- [ ] Output type
+- [x] Output type
   - [x] int8
   - [x] uint8 (only w/ Relu)
-  - [ ] int32
-  - [ ] uint32 (only w/ Relu)
+  - [x] int32
 - [ ] Scale type
   - [x] uint8
   - [ ] uint32

From b9f3de4a3371f85497dad02411d8817006e73924 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 16:25:36 +0100
Subject: [PATCH 67/72] Update changelog

---
 CHANGELOG.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 48a4461..0b6a7d0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,25 @@
 # Changelog
 
+## [Unreleased]
+
+### Added
+
+- N-EUREKA accelerator support: 3x3, 1x1, and 3x3 depthwise convolution kernels
+- Support for kernels without normalization and quantization for NE16
+- isort check
+- publication citation
+
+### Changed
+
+- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
+- strides in `ne16_task_set_strides`, and `ne16_task_set_dims` are now strides between consecutive elements in that dimension
+- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
+
+### Removed
+
+- `k_in_stride`, `w_in_stride`, `k_out_stride`, and `w_out_stride` from `ne16_nnx_dispatch_stride2x2`
+- `mode` attribute from `ne16_quant_t` structure
+
 ## [0.3.0] - 2024-01-14
 
 ### Added
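The `ne16_task_init` split listed under "Changed" results in a task-setup sequence like the following. This is a sketch distilled from the test app's `task_prepare` earlier in this series; the literal argument values are placeholders:

```c
#include "ne16_task.h"

void example_task_setup(ne16_task_t *task) {
  ne16_task_init(task);
  // 3x3 convolution, not depthwise, stride 1 (illustrative arguments)
  ne16_task_set_op_to_conv(task, 3 /* kernel */, 0 /* depthwise */, 1 /* stride */);
  ne16_task_set_bits(task, 8 /* in */, 8 /* out */, 8 /* weight */);
  ne16_task_set_weight_offset(task, weightOffsetModeLayerWise, -128 /* placeholder */);
  // ne16_task_set_norm_quant(...) is called only when the kernel uses
  // normalization/quantization -- the "without" support added above.
}
```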
From 5990d83c1f1d55b665469c9784bf39307a3957df Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 16:25:55 +0100
Subject: [PATCH 68/72] Update pulp-sdk commit hash

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e64ade1..a347588 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ The library was tested with the following pairs of SDKs and compilers:
 | SDK | SDK Commit Hash | Compiler | Compiler Commit Hash |
 | --- | --------------- | -------- | -------------------- |
 | gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 |
-| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | 38e2754c4a | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) |
+| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | c216298881 | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) |
 
 ## Contributing
 
 Bug reports and feature requests should be reported through issues.

From 07f47d9e752771846da939e12ed7c8c4c738cb11 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 16:26:27 +0100
Subject: [PATCH 69/72] Remove -std=c11 flag

---
 test/app/Makefile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/app/Makefile b/test/app/Makefile
index d73353f..ca65892 100644
--- a/test/app/Makefile
+++ b/test/app/Makefile
@@ -65,8 +65,4 @@ APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE
 
 APP_CFLAGS += -O2 -w -Wall -Werror
 
-ifndef GAP_SDK_HOME
-APP_CFLAGS += -std=c11
-endif
-
 include $(RULES_DIR)/pmsis_rules.mk

From 07911629a892c3fe0a13319e9f7b16acd1d103ff Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 16:34:51 +0100
Subject: [PATCH 70/72] Fix readme collapsible verbatim

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a347588..1671dc7 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,7 @@
If you use PULP-NNX in your work, you can cite us: + ``` @inproceedings{10.1145/3607889.3609092, author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco}, @@ -87,6 +88,7 @@ The library will follow the [Semantic Versioning](https://semver.org/). series = {CASES '23 Companion} } ``` +
 ## Contributors
 
From 9145445c16ce982fd1215f305389ee0ad68a0fe0 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Mon, 29 Jan 2024 16:45:23 +0100
Subject: [PATCH 71/72] Remove __PLATFORM__ check from the library since it's
 pulp-sdk specific

---
 ne16/gvsoc/ne16_gvsoc.h       | 4 ----
 neureka/gvsoc/neureka_gvsoc.h | 4 ----
 test/app/src/nnx_layer.c      | 4 ++++
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/ne16/gvsoc/ne16_gvsoc.h b/ne16/gvsoc/ne16_gvsoc.h
index e461416..f6626fd 100644
--- a/ne16/gvsoc/ne16_gvsoc.h
+++ b/ne16/gvsoc/ne16_gvsoc.h
@@ -42,17 +42,13 @@ typedef enum ne16_gvsoc_log_level_e {
 static void ne16_gvsoc_log_activate(ne16_dev_t *dev,
                                     ne16_gvsoc_log_level_e log_level,
                                     ne16_gvsoc_log_format_e format) {
-#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
   hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL, log_level);
   hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_FORMAT, format);
-#endif
 }
 
 static void ne16_gvsoc_log_deactivate(ne16_dev_t *dev) {
-#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
   hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL,
                       NE16_GVSOC_LOG_LEVEL_CONFIG);
-#endif
 }
 
 #endif // __NE16_GVSOC_H__
diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h
index e163036..37eeab0 100644
--- a/neureka/gvsoc/neureka_gvsoc.h
+++ b/neureka/gvsoc/neureka_gvsoc.h
@@ -42,17 +42,13 @@ typedef enum neureka_gvsoc_log_level_e {
 static void neureka_gvsoc_log_activate(neureka_dev_t *dev,
                                        neureka_gvsoc_log_level_e log_level,
                                        neureka_gvsoc_log_format_e format) {
-#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
   hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level);
   hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format);
-#endif
 }
 
 static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) {
-#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
   hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL,
                       NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END);
-#endif
 }
 
 #endif // __NEUREKA_GVSOC_H__
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c
index 41317f6..486019d 100644
--- a/test/app/src/nnx_layer.c
+++ b/test/app/src/nnx_layer.c
@@ -186,7 +186,9 @@ static void task_prepare(nnx_task_t *task) {
 static void task_execute(nnx_task_t *task) {
   nnx_dev_t *dev = nnx_bsp_get_dev();
 
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
   nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, NNX_GVSOC_LOG_FORMAT);
+#endif
 
   nnx_bsp_conf_t conf = {.max_stall = 8};
   nnx_init(dev, &conf);
@@ -205,7 +207,9 @@ static void task_execute(nnx_task_t *task) {
 
   nnx_term(dev);
 
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
   nnx_gvsoc_log_deactivate(dev);
+#endif
 }
 
 void execute_nnx_layer(void *args) {

From 6d24dd84fdfbe3649980b64a47f8395a0b50af09 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Tue, 30 Jan 2024 11:12:05 +0100
Subject: [PATCH 72/72] Replace channel and bits with w_in_stride in set_ptrs

---
 CHANGELOG.md               |  2 +-
 ne16/hal/ne16_task.c       |  9 ++++-----
 ne16/hal/ne16_task.h       |  6 +++---
 neureka/hal/neureka_task.c | 11 +++++------
 neureka/hal/neureka_task.h |  6 +++---
 test/app/src/nnx_layer.c   |  4 ++--
 6 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b6a7d0..84b516f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@
 ### Changed
 
 - `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
-- strides in `ne16_task_set_strides`, and `ne16_task_set_dims` are now strides between consecutive elements in that dimension
+- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
 - `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
 
 ### Removed
diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c
index 5f856e4..f8408da 100644
--- a/ne16/hal/ne16_task.c
+++ b/ne16/hal/ne16_task.c
@@ -108,19 +108,18 @@ void ne16_task_set_weight_offset(ne16_task_t *task,
  * it was the start to the padded data.
  * Necessary for input pointer when it's padded.
  */
-uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
-                      const uint32_t channel, const uint8_t bits,
+uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
                       const uint8_t padding_top, const uint8_t padding_left) {
-  return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+  return ptr - (padding_top * width + padding_left) * width_stride;
 }
 
 void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
-                        uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+                        uint32_t w_in_stride, uint8_t padding_top,
                         uint8_t padding_left, uint32_t output_ptr,
                         uint32_t weights_ptr, uint32_t scale_ptr,
                         uint32_t shift_ptr, uint32_t bias_ptr) {
   task->data.infeat_ptr =
-      ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+      ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
   task->data.outfeat_ptr = output_ptr;
   task->data.weights_ptr = weights_ptr;
   task->data.scale_ptr = scale_ptr;
diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h
index dd12c39..69bc78c 100644
--- a/ne16/hal/ne16_task.h
+++ b/ne16/hal/ne16_task.h
@@ -131,10 +131,10 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
                                uint32_t i_width, uint32_t n_height,
                                uint32_t n_width);
 uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
-                      const uint32_t channel, const uint8_t bits,
-                      const uint8_t padding_top, const uint8_t padding_left);
+                      const uint32_t width_stride, const uint8_t padding_top,
+                      const uint8_t padding_left);
 void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
-                        uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+                        uint32_t w_in_stride, uint8_t padding_top,
                         uint8_t padding_left, uint32_t output_ptr,
                         uint32_t weights_ptr, uint32_t scale_ptr,
                         uint32_t shift_ptr, uint32_t bias_ptr);
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index 4541f9d..501b2b9 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -122,20 +122,19 @@ void neureka_task_set_weight_source(neureka_task_t *task,
  * Necessary for input pointer when it's padded.
  */
uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
-                         const uint32_t channel, const uint8_t bits,
-                         const uint8_t padding_top,
+                         const uint32_t width_stride, const uint8_t padding_top,
                          const uint8_t padding_left) {
-  return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+  return ptr - (padding_top * width + padding_left) * width_stride;
 }
 
 void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
-                           uint32_t w_in, uint32_t k_in, uint8_t bits_in,
+                           uint32_t w_in, uint32_t w_in_stride,
                            uint8_t padding_top, uint8_t padding_left,
                            uint32_t output_ptr, uint32_t weights_ptr,
                            uint32_t scale_ptr, uint32_t shift_ptr,
                            uint32_t bias_ptr) {
-  task->data.infeat_ptr = neureka_pad_ptr(input_ptr, w_in, k_in, bits_in,
-                                          padding_top, padding_left);
+  task->data.infeat_ptr =
+      neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
   task->data.outfeat_ptr = output_ptr;
   task->data.weights_ptr = weights_ptr;
   task->data.scale_ptr = scale_ptr;
diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h
index a265223..2d06468 100644
--- a/neureka/hal/neureka_task.h
+++ b/neureka/hal/neureka_task.h
@@ -140,10 +140,10 @@ uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
                                   uint32_t i_width, uint32_t n_height,
                                   uint32_t n_width);
 uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
-                         const uint32_t channel, const uint8_t bits,
-                         const uint8_t padding_top, const uint8_t padding_left);
+                         const uint32_t width_stride, const uint8_t padding_top,
+                         const uint8_t padding_left);
 void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
-                           uint32_t w_in, uint32_t k_in, uint8_t bits_in,
+                           uint32_t w_in, uint32_t w_in_stride,
                            uint8_t padding_top, uint8_t padding_left,
                            uint32_t output_ptr, uint32_t weights_ptr,
                            uint32_t scale_ptr, uint32_t shift_ptr,
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c
index 486019d..0d98ff6 100644
--- a/test/app/src/nnx_layer.c
+++ b/test/app/src/nnx_layer.c
@@ -167,8 +167,8 @@ static void task_prepare(nnx_task_t *task) {
                            PADDING_LEFT);
 #endif
 
-  nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL,
-                    INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output,
+  nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, w_in_stride,
+                    PADDING_TOP, PADDING_LEFT, (uint32_t)output,
                     (uint32_t)weight,
 #if HAS_NORM_QUANT == 1
                     (uint32_t)scale, NULL,
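The `*_pad_ptr` rework in this last patch replaces the `channel * bits / 8` product with a caller-supplied width stride. A worked sketch of the arithmetic, with illustrative values only:

```c
#include <stdint.h>

// For a 32-pixel-wide, 16-channel, 8-bit input with 1 pixel of top and left
// padding, the width stride is 16 B per pixel, so the pointer is rewound by
// (1 * 32 + 1) * 16 = 528 B -- as if it pointed at the start of the padded data.
static uint32_t pad_ptr_example(uint32_t input_ptr) {
  const uint32_t w_in = 32;
  const uint32_t w_in_stride = 16 * 8 / 8; // channels * bits / 8
  const uint8_t padding_top = 1, padding_left = 1;
  // Same arithmetic as ne16_pad_ptr/neureka_pad_ptr after this patch:
  return input_ptr - (padding_top * w_in + padding_left) * w_in_stride;
}
```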