From 499fed21006fa4dc1aa2093b0fc745421402c0a3 Mon Sep 17 00:00:00 2001
From: Luka Macan <luka.macan@unibo.it>
Date: Sun, 14 Jan 2024 19:37:51 +0100
Subject: [PATCH] Add neureka support similar to ne16

---
 inc/pulp_nnx_neureka.h                        |  77 ++++
 neureka/bsp/neureka_siracusa_bsp.c            |  93 ++++
 neureka/bsp/neureka_siracusa_bsp.h            |  81 ++++
 neureka/gvsoc/neureka_gvsoc.h                 |  54 +++
 .../pulp_nnx_error_codes.h => hal/neureka.c}  |  27 +-
 .../{src/pulp_nnx_util.c => hal/neureka.h}    |  22 +-
 neureka/hal/neureka_task.c                    | 234 ++++++++++
 neureka/hal/neureka_task.h                    | 173 ++++++++
 neureka/hal/neureka_task_defs.h               | 114 +++++
 neureka/inc/pulp_nnx_defs.h                   | 167 -------
 neureka/inc/pulp_nnx_hal.h                    | 217 ---------
 neureka/inc/pulp_nnx_util.h                   |  27 --
 neureka/src/pulp_nnx_hal.c                    | 412 ------------------
 src/pulp_nnx_neureka.c                        | 131 ++++++
 14 files changed, 988 insertions(+), 841 deletions(-)
 create mode 100644 inc/pulp_nnx_neureka.h
 create mode 100644 neureka/bsp/neureka_siracusa_bsp.c
 create mode 100644 neureka/bsp/neureka_siracusa_bsp.h
 create mode 100644 neureka/gvsoc/neureka_gvsoc.h
 rename neureka/{inc/pulp_nnx_error_codes.h => hal/neureka.c} (53%)
 rename neureka/{src/pulp_nnx_util.c => hal/neureka.h} (62%)
 create mode 100644 neureka/hal/neureka_task.c
 create mode 100644 neureka/hal/neureka_task.h
 create mode 100644 neureka/hal/neureka_task_defs.h
 delete mode 100644 neureka/inc/pulp_nnx_defs.h
 delete mode 100644 neureka/inc/pulp_nnx_hal.h
 delete mode 100644 neureka/inc/pulp_nnx_util.h
 delete mode 100644 neureka/src/pulp_nnx_hal.c
 create mode 100644 src/pulp_nnx_neureka.c

diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h
new file mode 100644
index 0000000..48e16f1
--- /dev/null
+++ b/inc/pulp_nnx_neureka.h
@@ -0,0 +1,77 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include <stdint.h>
+
+/* PULP-NNX interface; conf is the Siracusa BSP configuration struct */
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf);
+void neureka_nnx_term(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_check
+ *
+ * Check whether you can dispatch to the accelerator.
+ */
+int neureka_nnx_dispatch_check(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_wait
+ *
+ * Block until you can dispatch to the accelerator.
+ */
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch
+ *
+ * Dispatch a task to the accelerator.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0.
+ */
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_check
+ *
+ * Check whether the task has been resolved.
+ */
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_wait
+ *
+ * Block until you can resolve the task.
+ */
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task);
+
+
+/* Additional helper functions */
+
+/** neureka_nnx_dispatch_stride2x2
+ *
+ * It uses Neureka's 2x2 strided mode which reduces the number of writes Neureka does.
+ * This mode doesn't stride the Neureka's subtile input pointer, so we have to
+ * tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
+ * Works only if the k_out is divisible by 2.
+ */
+void neureka_nnx_dispatch_stride2x2(
+    neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+    const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker);
diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c
new file mode 100644
index 0000000..28deda0
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.c
@@ -0,0 +1,93 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_siracusa_bsp.h"
+#include <pmsis.h>
+
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18 /* offset from base */
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR /* base + HWPE offset */       \
+  (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
+#define NEUREKA_SIRACUSA_MAX_STALL (8)
+#define NEUREKA_SIRACUSA_EVENT_0 (1 << 12)
+#define NEUREKA_SIRACUSA_EVENT_1 (1 << 13)
+#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000)
+
+void neureka_siracusa_cg_enable() { // sets CG_EN via read-modify-write
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+void neureka_siracusa_cg_disable() { // clears CG_EN via read-modify-write
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+// Sets the HCI_PRIO bit. TODO: Check if needed for neureka
+void neureka_siracusa_hci_setpriority_neureka() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+// Clears the HCI_PRIO bit. TODO: Check if needed for neureka
+void neureka_siracusa_hci_setpriority_core() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+// Zeroes the HCI_MAXSTALL field (mask 0xff). TODO: Check if needed for neureka
+void neureka_siracusa_hci_reset_max_stall() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+// TODO: Check if needed for neureka. NOTE: ORs in; old bits are not cleared.
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf) {
+  neureka_siracusa_cg_enable(); // CG_EN first, then priority, then max stall
+  neureka_siracusa_hci_setpriority_neureka();
+  neureka_siracusa_hci_set_max_stall(conf->max_stall); // int -> uint32_t
+}
+
+void neureka_siracusa_close() { // mirrors open() in reverse order
+  neureka_siracusa_hci_reset_max_stall();
+  neureka_siracusa_hci_setpriority_core();
+  neureka_siracusa_cg_disable();
+}
+
+void neureka_siracusa_event_wait_and_clear() { // event 0 assumed -- confirm
+  eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT_0);
+}
+
+/* Device singleton; plain nested initializer is a constant expression. */
+static const neureka_dev_t neureka_siracusa_dev = {
+    .hwpe_dev = {.base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}};
+
+const neureka_dev_t *neureka_siracusa_get_dev() { return &neureka_siracusa_dev; }
diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h
new file mode 100644
index 0000000..9e879e8
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.h
@@ -0,0 +1,81 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_siracusa_BSP_H__
+#define __NEUREKA_siracusa_BSP_H__
+
+#include "neureka.h"
+#include <stdint.h>
+
+/**
+ * neureka_siracusa_cg_enable
+ *
+ * Enable clock gating of the neureka.
+ */
+void neureka_siracusa_cg_enable();
+
+/**
+ * neureka_siracusa_cg_disable
+ *
+ * Disable clock gating of the neureka.
+ */
+void neureka_siracusa_cg_disable();
+
+/**
+ * neureka_siracusa_hci_setpriority_neureka
+ *
+ * Set HCI interconnect bus priority to prioritize neureka.
+ */
+void neureka_siracusa_hci_setpriority_neureka();
+
+/**
+ * neureka_siracusa_hci_setpriority_core
+ *
+ * Set HCI bus priority to prioritize cores.
+ */
+void neureka_siracusa_hci_setpriority_core();
+
+/**
+ * neureka_siracusa_hci_reset_max_stall
+ *
+ * Reset the HCI bus maxstall parameter.
+ * TODO: Check if it disables it also or just resets?
+ */
+void neureka_siracusa_hci_reset_max_stall();
+
+/**
+ * neureka_siracusa_hci_set_max_stall
+ *
+ * Set the HCI bus maxstall. Maxstall defines how many cycles
+ * the HCI bus will stall the lower priority master, i.e. neureka or core,
+ * before letting it do a transaction.
+ */
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall);
+
+typedef struct neureka_siracusa_conf_t {
+  int max_stall; // forwarded to hci_set_max_stall() by neureka_siracusa_open
+} neureka_siracusa_conf_t;
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf);
+void neureka_siracusa_close();
+void neureka_siracusa_event_wait_and_clear();
+const neureka_dev_t *neureka_siracusa_get_dev();
+
+#endif // !__NEUREKA_siracusa_BSP_H__
diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h
new file mode 100644
index 0000000..37eeab0
--- /dev/null
+++ b/neureka/gvsoc/neureka_gvsoc.h
@@ -0,0 +1,54 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_GVSOC_H__
+#define __NEUREKA_GVSOC_H__
+
+#include "neureka.h"
+#include "neureka_task.h"
+
+#define NEUREKA_REG_GVSOC_LOG_LEVEL 24
+#define NEUREKA_REG_GVSOC_LOG_FORMAT 25
+
+typedef enum neureka_gvsoc_log_format_e {
+  NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0,
+  NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3
+} neureka_gvsoc_log_format_e;
+
+typedef enum neureka_gvsoc_log_level_e {
+  NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0,
+  NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1,
+  NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2,
+  NEUREKA_GVSOC_LOG_LEVEL_ALL = 3
+} neureka_gvsoc_log_level_e;
+
+static inline void neureka_gvsoc_log_activate(
+    neureka_dev_t *dev, neureka_gvsoc_log_level_e log_level,
+    neureka_gvsoc_log_format_e format) { // writes level, then format
+  hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level);
+  hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format);
+}
+
+static inline void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) {
+  hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL,
+                      NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END); // level 0
+}
+
+#endif // __NEUREKA_GVSOC_H__
diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c
similarity index 53%
rename from neureka/inc/pulp_nnx_error_codes.h
rename to neureka/hal/neureka.c
index dc71575..ebcad93 100644
--- a/neureka/inc/pulp_nnx_error_codes.h
+++ b/neureka/hal/neureka.c
@@ -18,15 +18,22 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#ifndef __NE16_ERROR_CODES_H__
-#define __NE16_ERROR_CODES_H__
+#include "neureka.h"
 
-typedef enum {
-  success = 0,
-  weightBitwidthOutOfBounds,
-  unsupportedWeightOffsetMode,
-  unsupportedFeatureBitwidth,
-  dimensionMismatch
-} nnx_error_code;
+#define NEUREKA_STATUS_EMPTY (0x000)
+#define NEUREKA_STATUS_FULL (0x101)
 
-#endif // __NE16_ERROR_CODES_H__
\ No newline at end of file
+inline int neureka_task_queue_size(neureka_dev_t *dev) { return 2; } // fixed
+
+inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) {
+  uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
+  return (status & 0x1) + ((status >> 8) & 0x1); // bit 0 + bit 8, so 0..2
+}
+
+inline int neureka_task_queue_empty(neureka_dev_t *dev) { // status == 0x000
+  return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY;
+}
+
+inline int neureka_task_queue_full(neureka_dev_t *dev) { // status == 0x101
+  return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL;
+}
diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h
similarity index 62%
rename from neureka/src/pulp_nnx_util.c
rename to neureka/hal/neureka.h
index daaaf2b..887d995 100644
--- a/neureka/src/pulp_nnx_util.c
+++ b/neureka/hal/neureka.h
@@ -18,13 +18,19 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "pulp_nnx_util.h"
-#include "pulp_nnx_hal.h"
+#ifndef __NEUREKA_H__
+#define __NEUREKA_H__
 
-void nnx_activate_gvsoc_logging(int log_level) {
-  NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level);
-}
+#include "hwpe.h"
+#include <stdint.h>
 
-void nnx_deactivate_gvsoc_logging() {
-  NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0);
-}
+typedef struct neureka_dev_t {
+  hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
+} neureka_dev_t;
+
+int neureka_task_queue_size(neureka_dev_t *dev);
+int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev);
+int neureka_task_queue_empty(neureka_dev_t *dev);
+int neureka_task_queue_full(neureka_dev_t *dev);
+
+#endif // __NEUREKA_H__
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
new file mode 100644
index 0000000..943c373
--- /dev/null
+++ b/neureka/hal/neureka_task.c
@@ -0,0 +1,234 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_task.h"
+#include "neureka_task_defs.h"
+#include "pulp_nnx_util.h"
+
+inline uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+                                         uint32_t i_width, uint32_t n_height,
+                                         uint32_t n_width) {
+  uint32_t tile_padding = padding; // interior edges get their nibble cleared
+  if (i_height > 0) { // not the first tile row: drop top padding [31:28]
+    tile_padding &= ~(0xfu << 28); // unsigned: 0xf << 28 overflows int
+  }
+  if (i_width < n_width - 1) { // not the last column: drop right [27:24]
+    tile_padding &= ~(0xfu << 24);
+  }
+  if (i_height < n_height - 1) { // not the last row: drop bottom [23:20]
+    tile_padding &= ~(0xfu << 20);
+  }
+  if (i_width > 0) { // not the first column: drop left padding [19:16]
+    tile_padding &= ~(0xfu << 16);
+  }
+  return tile_padding;
+}
+
+void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape,
+                       const uint8_t depthwise, const uint8_t input_bits,
+                       const uint8_t output_bits, const uint8_t weights_bits,
+                       const neureka_weight_offset_mode_e weights_offset_mode,
+                       const uint32_t weights_offset_factor, neureka_quant_t quant,
+                       neureka_norm_t norm, const uint8_t stride) {
+  const uint32_t flag_mode16 =
+      input_bits == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
+  // Overwrite the whole task; fields not named here (e.g. id) become zero.
+  *task = (neureka_task_t){
+      .outbytes = output_bits / 8,
+      .weight_d0_stride = flag_mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16
+                                      : kernel_shape == 3 ? 
+                                        NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 :
+                                        NEUREKA_WEIGHT_D0_STRIDE_MODE8,
+      .qw = weights_bits,
+      .stride_shift = stride == 2 ? 1 : 0,
+      .output_channel_throughput = depthwise ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3
+                                             : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT,
+      .input_channel_throughput = kernel_shape == 3 ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3
+                                                    : NEUREKA_INPUT_CHANNEL_THROUGHPUT,
+      .kernel_shape = kernel_shape,
+      .depthwise = depthwise,
+      .data = {0}};
+  // NOTE(review): the flag_mode16 ?: above assumes MODE_BASIC == 0 -- confirm.
+  const int flag_stride2x2 = stride == 2 ? NEUREKA_FLAG_STRIDE_2x2 : 0;
+  // NOTE(review): depthwise is `== 1` here but a plain truth test above.
+  const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1
+                        : depthwise == 1  ? NEUREKA_FLAG_MODE_3x3_DW
+                                          : NEUREKA_FLAG_MODE_3x3;
+  // OR the quantization, normalization and mode flags into conf0.
+  task->data.cfg.conf0 |=
+      NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode |
+      (quant.shift_amount << 16) | quant.flag_rounding << NEUREKA_SHIFT_ROUNDING |
+      norm.mode | norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
+      norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode |
+      flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2;
+
+  task->data.cfg.weight_offset_factor = weights_offset_factor;
+}
+
+/** neureka_pad_ptr
+ *
+ * Rewind ptr by padding_top rows plus padding_left pixels so that it
+ * addresses the position where the padded input would begin.
+ */
+inline uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+                                const uint32_t channel, const uint8_t bits,
+                                const uint8_t padding_top,
+                                const uint8_t padding_left) {
+  const uint32_t padded_pixels = padding_top * width + padding_left;
+  return ptr - padded_pixels * channel * bits / 8;
+}
+
+inline void neureka_task_set_ptrs(
+    neureka_task_t *task, uint32_t input_ptr, uint32_t w_in, uint32_t k_in,
+    uint8_t bits_in, uint8_t padding_top, uint8_t padding_left,
+    uint32_t output_ptr, uint32_t weights_ptr, uint32_t scale_ptr,
+    uint32_t shift_ptr, uint32_t bias_ptr) {
+  neureka_task_data_t *const data = &task->data; // only data.* is written
+  data->weights_ptr = weights_ptr;
+  data->outfeat_ptr = output_ptr;
+  data->scale_ptr = scale_ptr;
+  data->scale_shift_ptr = shift_ptr;
+  data->scale_bias_ptr = bias_ptr;
+  data->infeat_ptr = // rewound so it points at the padded input's origin
+      neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+}
+
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+                           const uint32_t w_in_stride,
+                           const uint32_t k_in_stride,
+                           const uint32_t w_out_stride,
+                           const uint32_t k_out_stride) {
+  const uint32_t num_k_in = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
+  // NOTE(review): fixed divisor above; set_counters uses the task's -- confirm.
+  const neureka_stride_t input_stride = {
+      .d0 = k_in_stride,
+      .d1 = k_in_stride * w_in_stride,
+      .d2 = task->depthwise ? 0 : 
+            task->kernel_shape == 1 ? k_in_stride * 3 * 3 : // TODO: Check this magic
+              k_in_stride * NEUREKA_FILTER_BUFFER_SIZE * NEUREKA_FILTER_BUFFER_SIZE};
+  task->data.cfg.input_stride = input_stride;
+
+  // WARNING: Stride works only for even output channel sizes (divisible by 2)
+  const neureka_stride_t output_stride = {
+      .d0 = 32,
+      .d1 = (k_out_stride * task->outbytes) >> task->stride_shift,
+      .d2 =
+          (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift};
+  task->data.cfg.output_stride = output_stride;
+  // Weight strides by mode: kernel_shape 1, non-depthwise, then depthwise.
+  if (task->kernel_shape == 1) {
+    task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw;
+    task->data.cfg.weights_stride.d1 =
+        task->weight_d0_stride * task->qw * num_k_in;
+    task->data.cfg.weights_stride.d2 = 0;
+  } else if (!task->depthwise) {
+    task->data.cfg.weights_stride.d0 = task->weight_d0_stride;
+    task->data.cfg.weights_stride.d1 = task->weight_d0_stride * task->qw *
+                                       num_k_in;
+    task->data.cfg.weights_stride.d2 = 0;
+  } else {
+    task->data.cfg.weights_stride.d0 =
+        NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * task->weight_d0_stride;
+    task->data.cfg.weights_stride.d1 = 0;
+    task->data.cfg.weights_stride.d2 = 0;
+  }
+}
+
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+                            const uint32_t h_out, const uint32_t w_out,
+                            const uint32_t k_out, const uint8_t padding_bottom,
+                            const uint8_t padding_right) {
+  const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput);
+  const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput);
+  const uint16_t num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
+  const uint16_t num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
+  // Per-dimension remainders; the input ones add 2 unless kernel_shape is 1.
+  const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput);
+  const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput);
+  const uint16_t rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
+  const uint16_t rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
+  const uint16_t rem_Hi =
+      (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; // TODO: Check padding bottom
+  const uint16_t rem_Wi =
+      (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right; // TODO: Check padding right
+  // Value pairs are combined with concat_half before being stored in the task.
+  const neureka_subtile_t subtile = {
+      .number = {.KoKi = concat_half(num_Ko, num_Ki),
+                 .HoWo = concat_half(num_Ho, num_Wo)},
+      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
+                    .HoWo = concat_half(rem_Ho, rem_Wo),
+                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
+  task->data.cfg.subtile = subtile;
+}
+
+inline void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+                                     const uint8_t bottom, const uint8_t left,
+                                     const uint8_t right, const uint8_t value) {
+  task->data.cfg.padding = ((top & 0xfu) << 28) | ((right & 0xfu) << 24) |
+                           ((bottom & 0xfu) << 20) | ((left & 0xfu) << 16) |
+                           (value & 0xffu); // 0xfu: no signed-shift UB
+}
+
+inline void neureka_task_set_mask_filter(
+    neureka_task_t *task, const uint8_t top, const uint8_t right,
+    const uint8_t bottom, const uint8_t left) { // one byte each, top in MSB
+  task->data.cfg.filter_mask = ((top & 0xffu) << 24) | ((right & 0xffu) << 16) |
+                               ((bottom & 0xffu) << 8) | ((left & 0xffu) << 0);
+}
+
+void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in,
+                        const uint32_t k_in, const uint32_t w_in_stride,
+                        const uint32_t k_in_stride, const uint32_t h_out,
+                        const uint32_t w_out, const uint32_t k_out,
+                        const uint32_t w_out_stride, const uint32_t k_out_stride,
+                        const uint8_t padding_top, const uint8_t padding_bottom,
+                        const uint8_t padding_right,
+                        const uint8_t padding_left) { // w_in is not used
+  neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                        k_out_stride);
+  neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
+                         padding_right);
+  neureka_task_set_padding(task, padding_top, padding_bottom, padding_left,
+                        padding_right, 0); // padding value is fixed to 0
+}
+
+void neureka_task_set_dims_stride2x2(
+    neureka_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left) {
+  const uint8_t stride = 2;
+  // Counters get an output of 3 (or 1) per axis; right padding is passed as 0.
+  neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                        k_out_stride);
+  neureka_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
+                         k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0);
+  // Bottom/right padding is dropped when (in + pad - ker) divides by stride.
+  const uint8_t padding_bottom_new =
+      (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
+  const uint8_t padding_right_new =
+      (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right;
+
+  neureka_task_set_padding(task, padding_top, padding_bottom_new, padding_left,
+                        padding_right_new, 0);
+}
diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h
new file mode 100644
index 0000000..7f4c31b
--- /dev/null
+++ b/neureka/hal/neureka_task.h
@@ -0,0 +1,173 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_TASK_H__
+#define __NEUREKA_TASK_H__
+
+#include "neureka_task_defs.h"
+#include <stdint.h>
+
+typedef enum neureka_task_flag_e {
+  neurekaTaskFlagFalse = 0,
+  neurekaTaskFlagTrue = 1
+} neureka_task_flag_e;
+
+typedef enum neureka_weight_offset_mode_e {
+  weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
+  weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
+} neureka_weight_offset_mode_e;
+
+typedef enum {
+  normMode8Bit = NEUREKA_NORM_MODE_8BIT,
+  normMode16Bit = NEUREKA_NORM_MODE_16BIT,
+  normMode32Bit = NEUREKA_NORM_MODE_32BIT
+} neureka_norm_mode_e;
+
+typedef struct neureka_norm_t {
+  neureka_norm_mode_e mode;
+  int flag_bias;
+  int flag_shift;
+} neureka_norm_t;
+
+typedef enum neureka_quant_mode_e {
+  quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
+  quantMode16Bit = NEUREKA_QUANT_MODE_16BIT,
+  quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
+} neureka_quant_mode_e;
+
+typedef enum neureka_quant_function_e {
+  quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
+  quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
+} neureka_quant_function_e;
+
+typedef struct neureka_quant_t {
+  // Shift amount must be in range 0x00-0x1F
+  unsigned shift_amount;
+  neureka_quant_mode_e mode;
+  neureka_quant_function_e function;
+  int flag_rounding;
+} neureka_quant_t;
+
+typedef struct neureka_stride_t {
+  uint32_t d0;
+  uint32_t d1;
+  uint32_t d2;
+} neureka_stride_t;
+
+typedef struct neureka_subtile_remainder_t {
+  uint32_t KoKi;
+  uint32_t HoWo;
+  uint32_t HiWi;
+} neureka_subtile_remainder_t;
+
+typedef struct neureka_subtile_number_t {
+  uint32_t KoKi;
+  uint32_t HoWo;
+} neureka_subtile_number_t;
+
+typedef struct neureka_subtile_t {
+  neureka_subtile_remainder_t remainder;
+  neureka_subtile_number_t number;
+} neureka_subtile_t;
+
+typedef struct neureka_cfg_t {
+  neureka_stride_t input_stride;
+  neureka_stride_t output_stride;
+  neureka_stride_t weights_stride;
+  neureka_subtile_t subtile;
+  uint32_t padding;
+  uint32_t weight_offset_factor;
+  uint32_t filter_mask;
+  uint32_t conf0;
+} neureka_cfg_t;
+
+typedef struct neureka_task_data_t {
+  uint32_t weights_ptr;
+  uint32_t infeat_ptr;
+  uint32_t outfeat_ptr;
+  uint32_t scale_ptr;
+  uint32_t scale_shift_ptr;
+  uint32_t scale_bias_ptr;
+  neureka_cfg_t cfg;
+} neureka_task_data_t;
+
+typedef struct neureka_task_t {
+  neureka_task_data_t data;
+  uint8_t outbytes;                  // output_bits / 8, set by task_init
+  uint8_t weight_d0_stride;          // picked from input_bits/kernel_shape
+  uint8_t qw;                        // weights_bits
+  uint8_t stride_shift;              // 1 when stride == 2, otherwise 0
+  uint8_t output_channel_throughput; // divisor for k_out in set_counters
+  uint8_t input_channel_throughput;  // divisor for k_in in set_counters
+  uint8_t kernel_shape;
+  uint8_t depthwise;
+  uint8_t id;                        // zeroed by neureka_task_init
+} neureka_task_t;
+
+void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape,
+                    const uint8_t depthwise, const uint8_t input_bits,
+                    const uint8_t output_bits, const uint8_t weights_bits,
+                    const neureka_weight_offset_mode_e weights_offset_mode,
+                    const uint32_t weights_offset_factor, neureka_quant_t quant,
+                    neureka_norm_t norm, const uint8_t stride);
+uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+                               uint32_t i_width, uint32_t n_height,
+                               uint32_t n_width);
+uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+                      const uint32_t channel, const uint8_t bits,
+                      const uint8_t padding_top, const uint8_t padding_left);
+void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t w_in,
+                        uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+                        uint8_t padding_left, uint32_t output_ptr,
+                        uint32_t weights_ptr, uint32_t scale_ptr,
+                        uint32_t shift_ptr, uint32_t bias_ptr);
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+                           const uint32_t w_in_stride,
+                           const uint32_t k_in_stride,
+                           const uint32_t w_out_stride,
+                           const uint32_t k_out_stride);
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+                            const uint32_t h_out, const uint32_t w_out,
+                            const uint32_t k_out, const uint8_t padding_bottom,
+                            const uint8_t padding_right);
+void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+                           const uint8_t bottom, const uint8_t left,
+                           const uint8_t right, const uint8_t value);
+void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+                               const uint8_t right, const uint8_t bottom,
+                               const uint8_t left);
+void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in,
+                        const uint32_t k_in, const uint32_t w_in_stride,
+                        const uint32_t k_in_stride, const uint32_t h_out,
+                        const uint32_t w_out, const uint32_t k_out,
+                        const uint32_t w_out_stride, const uint32_t k_out_stride,
+                        const uint8_t padding_top, const uint8_t padding_bottom,
+                        const uint8_t padding_right,
+                        const uint8_t padding_left);
+void neureka_task_set_dims_stride2x2(
+    neureka_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left);
+
+#endif // !__NEUREKA_TASK_H__
diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h
new file mode 100644
index 0000000..daa9897
--- /dev/null
+++ b/neureka/hal/neureka_task_defs.h
@@ -0,0 +1,114 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_DEFS_H__
+#define __NEUREKA_DEFS_H__
+
+/* ARCHITECTURE */
+
+#define NEUREKA_FILTER_SIZE (6)
+#define NEUREKA_FILTER_BUFFER_SIZE (8)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
+#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_WEIGHT_BANDWIDTH (256)
+
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)
+
+/* TASK REGISTERS */
+
+// job configuration (consecutive register indices 0..23)
+#define NEUREKA_REG_WEIGHTS_PTR 0
+#define NEUREKA_REG_INFEAT_PTR 1
+#define NEUREKA_REG_OUTFEAT_PTR 2
+#define NEUREKA_REG_SCALE_PTR 3
+#define NEUREKA_REG_SCALE_SHIFT_PTR 4
+#define NEUREKA_REG_SCALE_BIAS_PTR 5
+#define NEUREKA_REG_INFEAT_D0_STRIDE 6
+#define NEUREKA_REG_INFEAT_D1_STRIDE 7
+#define NEUREKA_REG_INFEAT_D2_STRIDE 8
+#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9
+#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10
+#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11
+#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12
+#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13
+#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14
+#define NEUREKA_REG_SUBTILE_REMAINDER_0 15
+#define NEUREKA_REG_SUBTILE_REMAINDER_1 16
+#define NEUREKA_REG_SUBTILE_REMAINDER_2 17
+#define NEUREKA_REG_SUBTILE_NUMBER_0 18
+#define NEUREKA_REG_SUBTILE_NUMBER_1 19
+#define NEUREKA_REG_PADDING 20
+#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21
+#define NEUREKA_REG_FILTER_MASKING 22
+#define NEUREKA_REG_CONF0 23
+
+/* SHIFT: bit positions inside conf0, matching the CONF0 flags below */
+
+#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
+#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
+#define NEUREKA_SHIFT_QUANT_SHIFT (16)
+#define NEUREKA_SHIFT_ROUNDING (11)
+
+/* CONF0 FLAGS (entries written as `0 << n` name the all-zero field value) */
+
+#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
+#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
+#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
+#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
+#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
+#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
+#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
+// conf0[20:16] - quantization shift amount
+#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
+#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
+#define NEUREKA_FLAG_STREAMIN (1 << 14)
+#define NEUREKA_NORM_MODE_8BIT (0 << 12)
+#define NEUREKA_NORM_MODE_16BIT (1 << 12)
+#define NEUREKA_NORM_MODE_32BIT (2 << 12)
+#define NEUREKA_FLAG_ROUND (1 << 11)
+#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
+#define NEUREKA_FLAG_USE_WMEM (1 << 9)
+#define NEUREKA_FLAG_USE_TCDM (0 << 9)
+#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // TODO: Check if the `STRIDED` mode is still `STRIDE_2x2`
+#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
+#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
+#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
+#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
+#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
+#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
+#define NEUREKA_FLAG_MODE16 (1 << 3)
+
+/* Masks: QUANT_FUNCTION is conf0 bit 23, QUANT_MODE is conf0 bits 22:21 */
+
+#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23)
+#define NEUREKA_MASK_QUANT_MODE (3 << 21)
+
+/* PADDING */
+
+#define NEUREKA_DONT_PAD (0)
+#define NEUREKA_MAX_PAD (2)
+
+/* NORM */
+#define NEUREKA_NORM_MAX_LEN (32)
+
+#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h
deleted file mode 100644
index e8ecba5..0000000
--- a/neureka/inc/pulp_nnx_defs.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- * Arpan Prasad <prasadar@iis.ee.ethz.ch>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NEUREKA_DEFS_H__
-#define __NEUREKA_DEFS_H__
-
-/* ARHITECTURE */
-
-#define NEUREKA_FILTER_SIZE (6)
-#define NEUREKA_FILTER_BUFFER_SIZE (8)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
-#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
-#define NEUREKA_CONTEXT_SIZE (2)
-#define NEUREKA_WEIGHT_BANDWIDTH (256)
-
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)
-
-/* REGISTER MAP */
-
-#define NEUREKA_EVT0 12
-#define NEUREKA_EVT1 13
-#define NEUREKA_BASE_ADDR 0x00201000
-#define WEIGHT_MEM_BASE 0x10400000
-#define SRAM_OFFSET 0x00400000
-#define MRAM_OFFSET 0x00000000
-
-// Cluster
-#define CLUSTER_CTRL_BASE_ADDR 0x00200000
-#define CLUSTER_CTRL_HWPE_OFFS 0x18
-#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800
-
-/* REGISTER OFFSETS */
-
-// commands
-#define NEUREKA_TRIGGER 0x00
-#define NEUREKA_ACQUIRE 0x04
-#define NEUREKA_FINISHED 0x08
-#define NEUREKA_STATUS 0x0C
-#define NEUREKA_RUNNING_JOB 0x10
-#define NEUREKA_SOFT_CLEAR 0x14
-#define NEUREKA_SWSYNC 0x18
-#define NEUREKA_URISCY_IMEM 0x1C
-
-// job configuration
-#define NEUREKA_REGISTER_OFFSET 0x20
-
-#define NEUREKA_REG_WEIGHTS_PTR 0x00
-#define NEUREKA_REG_INFEAT_PTR 0x04
-#define NEUREKA_REG_OUTFEAT_PTR 0x08
-#define NEUREKA_REG_SCALE_PTR 0x0C
-#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10
-#define NEUREKA_REG_SCALE_BIAS_PTR 0x14
-#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18
-#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C
-#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20
-#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24
-#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28
-#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C
-#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30
-#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34
-#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38
-#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C
-#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40
-#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44
-#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48
-#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C
-#define NEUREKA_REG_PADDING 0x50
-#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54
-#define NEUREKA_REG_FILTER_MASKING 0x58
-#define NEUREKA_REG_CONF0 0x5C
-
-// Simulation only
-#define NEUREKA_REG_GVSOC_TRACE 0x60
-
-/*  SHIFT  */
-
-#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
-#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
-#define NEUREKA_SHIFT_QUANT_SHIFT (16)
-#define NEUREKA_SHIFT_ROUNDING (11)
-
-/*  CONF0 FLAGS */
-
-#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
-#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
-#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
-#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
-#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
-#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
-#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
-// conf0[20:16] - quantization shift amount
-#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
-#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
-#define NEUREKA_FLAG_STREAMIN (1 << 14)
-#define NEUREKA_NORM_MODE_8BIT (0 << 12)
-#define NEUREKA_NORM_MODE_16BIT (1 << 12)
-#define NEUREKA_NORM_MODE_32BIT (2 << 12)
-#define NEUREKA_FLAG_ROUND (1 << 11)
-#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
-#define NEUREKA_FLAG_USE_WMEM (1 << 9)
-#define NEUREKA_FLAG_USE_TCDM (0 << 9)
-#define NEUREKA_FLAG_STRIDED_MODE (1 << 8)
-#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
-#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
-#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
-#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
-#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
-#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
-#define NEUREKA_FLAG_MODE16 (1 << 3)
-
-/* Masks */
-
-#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23)
-#define NEUREKA_MASK_QUANT_MODE (3 << 21)
-
-/* Miscellaneous */
-
-// Padding
-#define MAX_PAD (0xf)
-
-// Normalization
-#define NEUREKA_NORM_MAX_LEN (32)
-#define NO_NORM(length)                                                        \
-  {                                                                            \
-    .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL,      \
-    .length = length, .mode = normMode32Bit                                    \
-  }
-
-// Quantization
-#define NO_QUANT                                                               \
-  {                                                                            \
-    .shift_amount = 0, .mode = quantMode32Bit,                                 \
-    .function = quantFunctionIdentity                                          \
-  }
-
-// GVSOC trace levels
-#define NEUREKA_TRACE_LEVEL_JOB_START_END 0
-#define NEUREKA_TRACE_LEVEL_CONFIG 1
-#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2
-#define NEUREKA_TRACE_LEVEL_ALL 3
-
-// null
-#define NEUREKA_NULL ((void *)0)
-#define NEUREKA_STATUS_FULL (0x101)
-
-#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h
deleted file mode 100644
index 40bcec0..0000000
--- a/neureka/inc/pulp_nnx_hal.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NEUREKA_H__
-#define __NEUREKA_H__
-
-#include <stdint.h>
-
-#include "pulp_nnx_defs.h"
-#include "pulp_nnx_error_codes.h"
-
-#define NEUREKA_CG_ENABLE()                                                    \
-  *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |=        \
-      CLUSTER_CTRL_HWPE_CG_EN_MASK
-#define NEUREKA_CG_DISABLE()                                                   \
-  *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &=        \
-      ~CLUSTER_CTRL_HWPE_CG_EN_MASK
-
-#define NEUREKA_WRITE(offset, value)                                           \
-  *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value)
-#define NEUREKA_WRITE_BE(offset, value, be)                                    \
-  *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value)
-#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset))
-
-#define NEUREKA_WRITE_IO_REG(offset, value)                                    \
-  NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value))
-#define NEUREKA_WRITE_IO_REG_BE(offset, value, be)                             \
-  NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be))
-#define NEUREKA_READ_IO_REG(offset)                                            \
-  NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset))
-
-#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0)
-#define NEUREKA_BARRIER()                                                      \
-  do {                                                                         \
-    eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0);                                  \
-  } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0)
-#define NEUREKA_BUSYWAIT()                                                     \
-  do {                                                                         \
-  } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0)
-#define NEUREKA_BARRIER_ACQUIRE(job_id)                                        \
-  job_id = NEUREKA_READ(NEUREKA_ACQUIRE);                                      \
-  while (job_id < 0) {                                                         \
-    eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0);                                  \
-    job_id = NEUREKA_READ(NEUREKA_ACQUIRE);                                    \
-  };
-#define NEUREKA_NOBARRIER_ACQUIRE(job_id)                                      \
-  job_id = NEUREKA_READ(NEUREKA_ACQUIRE);                                      \
-  while (job_id < 0) {                                                         \
-    job_id = NEUREKA_READ(NEUREKA_ACQUIRE);                                    \
-  };
-
-#define DIVNCEIL(A, B) (((A - 1) / B) + 1)
-#define REMAINDER(A, B) (((A - 1) % B) + 1)
-#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff))
-
-#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE
-
-#define FLAG_USED (1)
-#define FLAG_UNUSED (0)
-
-typedef enum {
-  weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
-  weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
-} nnx_weight_offset_mode_e;
-
-typedef struct {
-  void *data;
-  uint16_t height;
-  uint16_t width;
-  uint16_t depth;
-  uint16_t n_weights;
-  uint32_t bitwidth;
-  int32_t offset_factor;
-  nnx_weight_offset_mode_e offset_mode;
-} nnx_weights_t;
-
-typedef enum {
-  featureBitwidth8Bit = 8,
-  featureBitwidth16Bit = 16,
-  featureBitwidth32Bit = 32
-} nnx_feature_bitwidth_e;
-
-typedef struct {
-  void *data;
-  uint16_t height;
-  uint16_t width;
-  uint16_t depth;
-  nnx_feature_bitwidth_e bitwidth;
-} nnx_feature_t;
-
-typedef enum {
-  normMode8Bit = NEUREKA_NORM_MODE_8BIT,
-  normMode16Bit = NEUREKA_NORM_MODE_16BIT,
-  normMode32Bit = NEUREKA_NORM_MODE_32BIT
-} nnx_norm_mode_e;
-
-typedef struct {
-  nnx_norm_mode_e mode;
-  int flag_bias;
-  int flag_shift;
-} nnx_norm_t;
-
-typedef enum {
-  quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
-  quantMode16Bit = NEUREKA_QUANT_MODE_16BIT,
-  quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
-} nnx_quant_mode_e;
-
-typedef enum {
-  quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
-  quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
-} nnx_quant_function_e;
-
-// TODO: add rounding to quant. Should also be an enum? Best boolean...
-typedef struct {
-  // Shift amount must be in range 0x00-0x1F
-  unsigned shift_amount;
-  nnx_quant_mode_e mode;
-  nnx_quant_function_e function;
-  int flag_rounding;
-} nnx_quant_t;
-
-typedef struct {
-  uint32_t d0;
-  uint32_t d1;
-  uint32_t d2;
-} nnx_stride_t;
-
-typedef struct {
-  uint32_t KoKi;
-  uint32_t HoWo;
-  uint32_t HiWi;
-} nnx_subtile_remainder_t;
-
-typedef struct {
-  uint32_t KoKi;
-  uint32_t HoWo;
-} nnx_subtile_number_t;
-
-typedef struct {
-  nnx_subtile_remainder_t remainder;
-  nnx_subtile_number_t number;
-} nnx_subtile_t;
-
-typedef struct {
-  nnx_stride_t input_stride;
-  nnx_stride_t output_stride;
-  nnx_stride_t weights_stride;
-  nnx_subtile_t subtile;
-  uint32_t padding;
-  uint32_t weight_offset_factor;
-  uint32_t filter_mask;
-  uint32_t conf0;
-} nnx_cfg_t;
-
-typedef struct {
-  uint32_t weights_ptr;
-  uint32_t infeat_ptr;
-  uint32_t outfeat_ptr;
-  uint32_t scale_ptr;
-  uint32_t scale_shift_ptr;
-  uint32_t scale_bias_ptr;
-  nnx_cfg_t cfg;
-} nnx_task_t;
-
-int nnx_job_id();
-int nnx_empty();
-int nnx_full();
-void nnx_soft_clear();
-int nnx_acquire();
-void nnx_offload(nnx_task_t *task);
-void nnx_offload_ptr(nnx_task_t *task);
-void nnx_run_async();
-void nnx_run_blocking();
-void nnx_commit();
-void nnx_wait_empty();
-void nnx_wait_not_full();
-void nnx_wait_on_id(int id);
-void nnx_busywait();
-
-void nnx_task_init(nnx_task_t *task);
-int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom,
-                  uint32_t left, uint16_t value);
-int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant);
-void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom,
-                     uint8_t left);
-nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights,
-                            nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
-                                        int k_out, int k_in);
-nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights,
-                            nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
-                                        int k_out, int k_in);
-nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights,
-                               nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
-                                           int k_out, int k_in);
-
-#endif /* __NEUREKA_H__ */
diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h
deleted file mode 100644
index f29ff3e..0000000
--- a/neureka/inc/pulp_nnx_util.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __PULP_NNX_UTIL__
-#define __PULP_NNX_UTIL__
-
-void nnx_activate_gvsoc_logging(int use_dec);
-void nnx_deactivate_gvsoc_logging();
-
-#endif /* __PULP_NNX_UTIL__ */
diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c
deleted file mode 100644
index 1d99691..0000000
--- a/neureka/src/pulp_nnx_hal.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- * Arpan Prasad <prasadar@iis.ee.ethz.ch>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "pulp_nnx_hal.h"
-#include "pmsis.h"
-
-static int qw, weight_d0_stride, outbytes;
-
-// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and
-// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise
-// the compiler is not able to correctly factorize the NEUREKA base in case
-// several accesses are done, ending up with twice more code
-
-// __builtin_pulp_OffsetedX not defined - needs further investigation... (too
-// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK...
-
-int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); }
-
-int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; }
-
-int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); }
-
-void nnx_soft_clear() {
-  NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0);
-  for (volatile int i = 0; i < 10; i++)
-    ;
-}
-
-int nnx_acquire() {
-  int job_id = -1;
-  NEUREKA_BARRIER_ACQUIRE(job_id);
-  return job_id;
-}
-
-void nnx_offload(nnx_task_t *task) {
-  int *task_data = (int *)task;
-  for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) {
-    NEUREKA_WRITE_IO_REG(i * 4, task_data[i]);
-  }
-}
-
-void nnx_offload_ptr(nnx_task_t *task) {
-  int *task_data = (int *)task;
-  for (int i = 0; i < 6; ++i) {
-    NEUREKA_WRITE_IO_REG(i * 4, task_data[i]);
-  }
-}
-
-void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); }
-
-void nnx_run_blocking() {
-  nnx_run_async();
-  nnx_wait_empty();
-}
-
-void nnx_commit() {
-  NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger
-}
-
-void nnx_busywait() { NEUREKA_BUSYWAIT(); }
-
-void nnx_wait_empty() {
-  while (!nnx_empty())
-    NEUREKA_BARRIER_NOSTATUS();
-}
-
-void nnx_wait_not_full() {
-  while (nnx_full())
-    NEUREKA_BARRIER_NOSTATUS();
-}
-
-void nnx_wait_on_id(const int id) {
-  while (nnx_job_id() <= id) {
-    eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0);
-  };
-}
-
-void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); }
-
-int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right,
-                  const uint32_t bottom, const uint32_t left,
-                  const uint16_t value) {
-  uint32_t padding = 0;
-  uint32_t flags = 0;
-
-  if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) {
-    return 1;
-  }
-
-  cfg->padding =
-      (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value;
-
-  return 0;
-}
-
-int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm,
-                   const nnx_quant_t quant) {
-  if (quant.shift_amount > 31) {
-    printf("ERROR! quant.shift_amount > 31\n");
-    return 1;
-  }
-
-  if (quant.mode == quantMode16Bit) {
-    printf("ERROR! quant.mode == quantMode16Bit\n");
-    return 1;
-  }
-
-  BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode |
-                          (quant.shift_amount << 16) |
-                          quant.flag_rounding << NEUREKA_SHIFT_ROUNDING |
-                          norm.mode |
-                          norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
-                          norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT);
-
-  return 0;
-}
-
-void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right,
-                     const uint8_t bottom, const uint8_t left) {
-  cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) |
-                     ((uint32_t)bottom << 8) | ((uint32_t)left << 0);
-}
-
-nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out,
-                                        const int w_out, const int k_out,
-                                        const int k_in) {
-
-  const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
-  const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
-  const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
-  const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
-  const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
-  const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
-  const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
-  const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
-  const int rem_Hi = rem_Ho;
-  const int rem_Wi = rem_Wo;
-
-  const nnx_subtile_t subtile = {
-      .number = {.KoKi = concat_half(num_Ko, num_Ki),
-                 .HoWo = concat_half(num_Ho, num_Wo)},
-      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
-                    .HoWo = concat_half(rem_Ho, rem_Wo),
-                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
-  cfg->subtile = subtile;
-
-  // Strides
-  const nnx_stride_t input_stride = {
-      .d0 = k_in,
-      .d1 = k_in * w_out,
-      .d2 = k_in * 3 * 3 // copying arpan
-  };
-  cfg->input_stride = input_stride;
-
-  const nnx_stride_t output_stride = {
-      .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
-  cfg->output_stride = output_stride;
-
-  const nnx_stride_t weights_stride = {
-      .d0 = weight_d0_stride * qw,
-      .d1 = weight_d0_stride * qw * num_Ki,
-      .d2 = 0 // Unused
-  };
-  cfg->weights_stride = weights_stride;
-
-  return 0;
-}
-
-nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights,
-                            const nnx_feature_t input,
-                            const nnx_feature_t output) {
-  if (weights.bitwidth < 2 || weights.bitwidth > 8) {
-    return weightBitwidthOutOfBounds;
-  }
-
-  if (weights.offset_mode != weightOffsetModeLayerWise) {
-    // Currently only layer-wise mode is used.
-    return unsupportedWeightOffsetMode;
-  }
-
-  if ((input.bitwidth != featureBitwidth8Bit &&
-       input.bitwidth != featureBitwidth16Bit) ||
-      (output.bitwidth != featureBitwidth8Bit &&
-       output.bitwidth != featureBitwidth32Bit)) {
-    return unsupportedFeatureBitwidth;
-  }
-
-  if (input.height != output.height || input.width != output.width ||
-      input.depth != weights.depth || output.depth != weights.n_weights) {
-    return dimensionMismatch;
-  }
-
-  const int mode16 =
-      input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
-  BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 |
-                          (weights.bitwidth - 1));
-
-  // Global static variables needed by update_dims
-  outbytes = output.bitwidth / 8;
-  weight_d0_stride =
-      mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
-  qw = weights.bitwidth;
-
-  nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth,
-                           input.depth);
-
-  // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
-  cfg->weight_offset_factor = weights.offset_factor;
-
-  return 0;
-}
-
-nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out,
-                                        const int w_out, const int k_out,
-                                        const int k_in) {
-
-  const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
-  const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
-  const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
-  const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
-  const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
-  const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
-  const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
-  const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
-  const int rem_Hi = rem_Ho + 2;
-  const int rem_Wi = rem_Wo + 2;
-
-  const nnx_subtile_t subtile = {
-      .number = {.KoKi = concat_half(num_Ko, num_Ki),
-                 .HoWo = concat_half(num_Ho, num_Wo)},
-      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
-                    .HoWo = concat_half(rem_Ho, rem_Wo),
-                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
-  cfg->subtile = subtile;
-
-  // Strides
-  const nnx_stride_t input_stride = {.d0 = k_in,
-                                     .d1 = k_in * (w_out + 2),
-                                     .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE *
-                                           NEUREKA_FILTER_BUFFER_SIZE};
-  cfg->input_stride = input_stride;
-
-  const nnx_stride_t output_stride = {
-      .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
-  cfg->output_stride = output_stride;
-
-  const nnx_stride_t weights_stride = {
-      .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3,
-      .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki,
-      .d2 = 0 // Unused
-  };
-  cfg->weights_stride = weights_stride;
-
-  return 0;
-}
-
-nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights,
-                            const nnx_feature_t input,
-                            const nnx_feature_t output) {
-  if (weights.bitwidth < 2 || weights.bitwidth > 8) {
-    return weightBitwidthOutOfBounds;
-  }
-
-  if (weights.offset_mode != weightOffsetModeLayerWise) {
-    // Currently only layer-wise mode is used.
-    return unsupportedWeightOffsetMode;
-  }
-
-  if ((input.bitwidth != featureBitwidth8Bit &&
-       input.bitwidth != featureBitwidth16Bit) ||
-      (output.bitwidth != featureBitwidth8Bit &&
-       output.bitwidth != featureBitwidth32Bit)) {
-    return unsupportedFeatureBitwidth;
-  }
-
-  if (input.height - 2 != output.height || input.width - 2 != output.width ||
-      input.depth != weights.depth || output.depth != weights.n_weights) {
-    return dimensionMismatch;
-  }
-
-  const int mode16 =
-      input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
-  BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 |
-                          (weights.bitwidth - 1));
-
-  // Global static variables needed by update_dims
-  outbytes = output.bitwidth / 8;
-  weight_d0_stride =
-      mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
-  qw = weights.bitwidth;
-
-  nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth,
-                           input.depth);
-
-  // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
-  cfg->weight_offset_factor = weights.offset_factor;
-
-  return 0;
-}
-
-nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out,
-                                           const int w_out, const int k_out,
-                                           const int k_in) {
-
-  const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
-  const int num_Ki = num_Ko;
-  const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
-  const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
-  const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
-  const int rem_Ki = rem_Ko;
-  const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
-  const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
-  const int rem_Hi = rem_Ho + 2;
-  const int rem_Wi = rem_Wo + 2;
-
-  const nnx_subtile_t subtile = {
-      .number = {.KoKi = concat_half(num_Ko, num_Ki),
-                 .HoWo = concat_half(num_Ho, num_Wo)},
-      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
-                    .HoWo = concat_half(rem_Ho, rem_Wo),
-                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
-  cfg->subtile = subtile;
-
-  // Strides
-  const nnx_stride_t input_stride = {
-      .d0 = k_out,
-      .d1 = k_out * (w_out + 2),
-      .d2 = 0 // Unused
-  };
-  cfg->input_stride = input_stride;
-
-  const nnx_stride_t output_stride = {
-      .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
-  cfg->output_stride = output_stride;
-
-  const nnx_stride_t weights_stride = {
-      .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride,
-      .d1 = 0,
-      .d2 = 0 // Unused
-  };
-  cfg->weights_stride = weights_stride;
-
-  return 0;
-}
-
-nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights,
-                               const nnx_feature_t input,
-                               const nnx_feature_t output) {
-  if (weights.bitwidth < 2 || weights.bitwidth > 8) {
-    return weightBitwidthOutOfBounds;
-  }
-
-  if (weights.offset_mode != weightOffsetModeLayerWise) {
-    // Currently only layer-wise mode is used.
-    return unsupportedWeightOffsetMode;
-  }
-
-  if ((input.bitwidth != featureBitwidth8Bit &&
-       input.bitwidth != featureBitwidth16Bit) ||
-      (output.bitwidth != featureBitwidth8Bit &&
-       output.bitwidth != featureBitwidth32Bit)) {
-    return unsupportedFeatureBitwidth;
-  }
-
-  if (input.height - 2 != output.height || input.width - 2 != output.width ||
-      input.depth != output.depth) {
-    return dimensionMismatch;
-  }
-
-  const int mode16 =
-      input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
-  BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 |
-                          (weights.bitwidth - 1));
-
-  // Global static variables needed by update_dims
-  outbytes = output.bitwidth / 8;
-  weight_d0_stride =
-      mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
-  qw = weights.bitwidth;
-
-  nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth,
-                              input.depth);
-
-  // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
-  cfg->weight_offset_factor = weights.offset_factor;
-
-  return 0;
-}
diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c
new file mode 100644
index 0000000..1efb34f
--- /dev/null
+++ b/src/pulp_nnx_neureka.c
@@ -0,0 +1,131 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "pulp_nnx_neureka.h"
+#include "hwpe.h"
+#include "neureka.h"
+#include "pulp_nnx_util.h"
+#include <pmsis.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_pulp_conf_t *conf) {
+  neureka_pulp_open(conf);         // Step 1: platform open, driven by `conf`.
+  hwpe_soft_clear(&dev->hwpe_dev); // Step 2: soft-clear the HWPE of `dev`.
+}
+
+void neureka_nnx_term(neureka_dev_t *dev) {
+  hwpe_soft_clear(&dev->hwpe_dev); // Soft-clear the HWPE of `dev` first...
+  neureka_pulp_close();            // ...then close the platform side.
+}
+
+int neureka_nnx_dispatch_check(neureka_dev_t *dev) {
+  return !neureka_task_queue_full(dev); // Nonzero iff the queue is not full.
+}
+
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev) {
+  while (!neureka_nnx_dispatch_check(dev)) { // Block until the check passes.
+    neureka_pulp_event_wait_and_clear();     // Wait on the event between polls.
+  }
+}
+
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) {
+  if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) {
+    return 1; // Acquire reported nonzero: nothing was written; caller may retry.
+  }
+  hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data,
+                             (int)(sizeof(neureka_task_data_t) / 4)); // length in uint32_t words
+  hwpe_task_queue_release_and_run(&dev->hwpe_dev); // Must follow the write above.
+  return 0; // Task data was written and the run was triggered.
+}
+
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) {
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+  // GVSOC model has a broken running_id so resolve_check
+  // conservatively looks if the task queue is empty.
+  return neureka_task_queue_empty(dev);
+#else
+  uint8_t prev_task_id = task->id - 1; // uint8_t: becomes 255 when id is 0
+  return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || // 0 while last id is the predecessor's
+           (hwpe_last_task_id(&dev->hwpe_dev) == task->id &&    // 0 while last id is this task's...
+            !neureka_task_queue_empty(dev)));                   // ...and the queue is not yet empty
+#endif
+}
+
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) {
+  while (!neureka_nnx_resolve_check(dev, task)) { // Block until `task` checks as resolved.
+    neureka_pulp_event_wait_and_clear();          // Wait on the event between polls.
+  }
+}
+
+static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i,
+                                     uint32_t size_j, uint32_t size_k,
+                                     uint32_t stride_j, uint32_t stride_k,
+                                     uint32_t overlap_i, uint32_t overlap_j,
+                                     uint32_t offset_i, uint32_t offset_j,
+                                     uint8_t data_size) { // size_k is never read
+  return ptr + // address of tile (i, j) relative to base `ptr`
+         (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k *
+             data_size / 8 + // i term; "/ 8" treats data_size as bits
+         (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; // j term
+}
+
+void neureka_nnx_dispatch_stride2x2(
+    neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+    const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker) { // w_in is never read in this body
+  const uint8_t stride = 2; // the only stride this routine handles
+  const uint8_t bits = 8;   // forwarded to _get_tile_ptr as data_size
+
+  const uint32_t n_h = divnceil(h_out, stride); // tile rows to dispatch
+  const uint32_t n_w = divnceil(w_out, stride); // tile columns to dispatch
+  const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0; // odd h_out: rows i > 0 start `stride` input rows earlier
+  const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0;  // same, for columns j > 0
+  const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0;     // odd h_out: rows i > 0 start 1 output row earlier
+  const uint32_t output_width_offset = w_out % stride == 1 ? 1 : 0;      // same, for columns j > 0
+
+  const uint32_t input_base = task->data.infeat_ptr;    // snapshot: overwritten per tile below
+  const uint32_t output_base = task->data.outfeat_ptr;  // snapshot: overwritten per tile below
+  const uint32_t tile_padding = task->data.cfg.padding; // snapshot: overwritten per tile below
+
+  for (int i = 0; i < n_h; i++) {
+    for (int j = 0; j < n_w; j++) {
+      task->data.infeat_ptr = // input tile: (3 + ker - 1) per side, overlap (ker - stride)
+          _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in,
+                        w_in_stride, k_in_stride, h_ker - stride,
+                        w_ker - stride, i == 0 ? 0 : input_height_offset,
+                        j == 0 ? 0 : input_width_offset, bits);
+      task->data.outfeat_ptr = // output tile: 2 x 2, no overlap
+          _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride,
+                        k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset,
+                        j == 0 ? 0 : output_width_offset, bits);
+
+      task->data.cfg.padding = // per-tile padding derived from the saved task padding
+          neureka_get_tile_padding(tile_padding, i, j, n_h, n_w);
+
+      // Retry the dispatch until it succeeds, waiting on the event in between.
+      while (neureka_nnx_dispatch(dev, task)) {
+        neureka_pulp_event_wait_and_clear();
+      }
+    }
+  } // NOTE: infeat_ptr/outfeat_ptr/cfg.padding are not restored; they keep the last tile's values.
+}