From c764efe5bb00304954a0560ba295920677563c2e Mon Sep 17 00:00:00 2001
From: Luka Macan <luka.macan@unibo.it>
Date: Sun, 14 Jan 2024 18:14:19 +0100
Subject: [PATCH] Decouple pulp-sdk and add hwpe device structure (#1)

* Add hwpe interface and decouple pulp-sdk from hal

* Fixes and changes to test

Fixed/ignored some pyright bugs.
Made generated data files external, which additionally requires touching
the nnx_layer.c file before compiling so that it is rebuilt correctly
without needing `make clean`.
Added `-flto` flag to compilation.
Some formatting fixes.
---
 CHANGELOG.md                                  |  23 ++
 inc/pulp_nnx.h                                |  75 -------
 inc/pulp_nnx_ne16.h                           |  77 +++++++
 ne16/bsp/ne16_pulp_bsp.c                      |  85 ++++++++
 ne16/bsp/ne16_pulp_bsp.h                      |  81 +++++++
 .../{ne16_gvsoc_logging.h => ne16_gvsoc.h}    |  35 ++--
 ne16/hal/ne16.c                               |  39 ++++
 ne16/hal/ne16.h                               |  36 ++++
 ne16/hal/ne16_defs.h                          | 157 --------------
 ne16/hal/ne16_hal.h                           | 197 ------------------
 ne16/hal/{ne16_hal.c => ne16_task.c}          | 152 +++++++-------
 ne16/hal/ne16_task.h                          | 172 +++++++++++++++
 ne16/hal/ne16_task_defs.h                     | 107 ++++++++++
 src/pulp_nnx_ne16.c                           | 185 ++++------------
 test/.gitignore                               |   2 +-
 test/HeaderWriter.py                          |  52 ++++-
 test/Ne16TestClasses.py                       |  38 ++--
 test/TestClasses.py                           |   4 +-
 test/app/Makefile                             |  38 ++--
 test/app/src/main.c                           |  10 +-
 test/app/src/nnx_layer.c                      | 106 +++++-----
 test/conftest.py                              |   2 +-
 test/test.py                                  |   2 +
 util/hwpe.c                                   |  85 ++++++++
 util/hwpe.h                                   |  43 ++++
 25 files changed, 1044 insertions(+), 759 deletions(-)
 delete mode 100644 inc/pulp_nnx.h
 create mode 100644 inc/pulp_nnx_ne16.h
 create mode 100644 ne16/bsp/ne16_pulp_bsp.c
 create mode 100644 ne16/bsp/ne16_pulp_bsp.h
 rename ne16/gvsoc/{ne16_gvsoc_logging.h => ne16_gvsoc.h} (51%)
 create mode 100644 ne16/hal/ne16.c
 create mode 100644 ne16/hal/ne16.h
 delete mode 100644 ne16/hal/ne16_defs.h
 delete mode 100644 ne16/hal/ne16_hal.h
 rename ne16/hal/{ne16_hal.c => ne16_task.c} (61%)
 create mode 100644 ne16/hal/ne16_task.h
 create mode 100644 ne16/hal/ne16_task_defs.h
 create mode 100644 util/hwpe.c
 create mode 100644 util/hwpe.h
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 789566b..623a775 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,28 @@
 # Changelog
 
+## [Unreleased]
+
+### Added
+
+- New Hardware Processing Engine (HWPE) device in `util/hwpe.h`
+- A device structure for ne16 `ne16_dev_t` in `ne16/hal/ne16.h` which extends the hwpe device
+- Test app Makefile now has an `ACCELERATOR` variable to specify which accelerator is used
+
+### Changed
+
+- Library functions no longer start with a generic `nnx_` prefix but with `<accelerator>_nnx_` prefix
+  to allow for usage of multiple kinds of accelerators in the same system
+- Decoupled board-specific functionality into `ne16/bsp`, which also contains constant global
+  instances of the `ne16_dev_t` structure
+- Moved all task related functions (`nnx_task_set_dims*`) into `ne16/hal/ne16_task.c`
+- Tests adjusted for the new interface
+- Test data generation moved into source files with extern declarations to check the output from the main
+
+### Fixed
+
+- pyright errors
+- formatting errors
+
 ## [0.2.1] - 2024-01-08
 
 ### Fixed
diff --git a/inc/pulp_nnx.h b/inc/pulp_nnx.h
deleted file mode 100644
index 312eaed..0000000
--- a/inc/pulp_nnx.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __PULP_NNX_H__
-#define __PULP_NNX_H__
-
-#include <stdint.h>
-
-typedef struct nnx_task_t nnx_task_t;
-typedef struct nnx_norm_t nnx_norm_t;
-typedef struct nnx_quant_t nnx_quant_t;
-typedef enum nnx_weight_offset_mode_e nnx_weight_offset_mode_e;
-
-void nnx_init(uint32_t max_stall);
-void nnx_term();
-int nnx_dispatch_check();
-void nnx_dispatch_check_blocking();
-void nnx_dispatch_task(nnx_task_t *task);
-int nnx_resolve_check(nnx_task_t *task);
-void nnx_resolve_check_blocking(nnx_task_t *task);
-
-void nnx_task_init(nnx_task_t *task, const uint8_t kernel_shape,
-                   const uint8_t depthwise, const uint8_t input_bits,
-                   const uint8_t output_bits, const uint8_t weights_bits,
-                   nnx_weight_offset_mode_e weights_offset_mode,
-                   const uint32_t weights_offset_factor, nnx_quant_t quant,
-                   nnx_norm_t norm, const uint8_t stride);
-uint32_t nnx_pad_ptr(uint32_t ptr, const uint32_t width, const uint32_t channel,
-                     const uint8_t bits, const uint8_t padding_top,
-                     const uint8_t padding_left);
-void nnx_task_set_ptrs(nnx_task_t *task, uint32_t input_ptr, uint32_t w_in,
-                       uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
-                       uint8_t padding_left, uint32_t output_ptr,
-                       uint32_t weights_ptr, uint32_t scale_ptr,
-                       uint32_t shift_ptr, uint32_t bias_ptr);
-void nnx_task_set_dims(nnx_task_t *task, const uint32_t w_in,
-                       const uint32_t k_in, const uint32_t w_in_stride,
-                       const uint32_t k_in_stride, const uint32_t h_out,
-                       const uint32_t w_out, const uint32_t k_out,
-                       const uint32_t w_out_stride, const uint32_t k_out_stride,
-                       const uint8_t padding_top, const uint8_t padding_bottom,
-                       const uint8_t padding_right, const uint8_t padding_left);
-void nnx_task_set_dims_stride2x2(
-    nnx_task_t *task, const uint32_t h_in, const uint32_t w_in,
-    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
-    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
-    const uint32_t w_out_stride, const uint32_t k_out_stride,
-    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
-    const uint8_t padding_bottom, const uint8_t padding_right,
-    const uint8_t padding_left);
-void nnx_dispatch_task_stride2x2(
-    nnx_task_t *task, const uint32_t w_in, const uint32_t k_in,
-    const uint32_t w_in_stride, const uint32_t k_in_stride,
-    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
-    const uint32_t w_out_stride, const uint32_t k_out_stride,
-    const uint8_t h_ker, const uint8_t w_ker);
-
-#endif // __PULP_NNX_H__
diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h
new file mode 100644
index 0000000..eff9a60
--- /dev/null
+++ b/inc/pulp_nnx_ne16.h
@@ -0,0 +1,82 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __PULP_NNX_NE16_H__
+#define __PULP_NNX_NE16_H__
+
+#include "ne16.h"
+#include "ne16_pulp_bsp.h"
+#include "ne16_task.h"
+#include <stdint.h>
+
+/* PULP-NNX interface */
+
+void ne16_nnx_init(ne16_dev_t *dev, ne16_pulp_conf_t *conf);
+void ne16_nnx_term(ne16_dev_t *dev);
+
+/** ne16_nnx_dispatch_check
+ *
+ * Check whether you can dispatch to the accelerator.
+ */
+int ne16_nnx_dispatch_check(ne16_dev_t *dev);
+
+/** ne16_nnx_dispatch_wait
+ *
+ * Block until you can dispatch to the accelerator.
+ */
+void ne16_nnx_dispatch_wait(ne16_dev_t *dev);
+
+/** ne16_nnx_dispatch
+ *
+ * Dispatch a task to the accelerator.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0.
+ */
+int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task);
+
+/** ne16_nnx_resolve_check
+ *
+ * Check whether the task has been resolved.
+ */
+int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task);
+
+/** ne16_nnx_resolve_wait
+ *
+ * Block until you can resolve the task.
+ */
+void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);
+
+
+/* Additional helper functions */
+
+/** ne16_nnx_dispatch_stride2x2
+ *
+ * It uses NE16's 2x2 strided mode which reduces the number of writes NE16 does.
+ * This mode doesn't stride the NE16's subtile input pointer, so we have to
+ * tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
+ * Works only if the k_out is divisible by 2.
+ */
+void ne16_nnx_dispatch_stride2x2(
+    ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
+    const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker);
+
+#endif // __PULP_NNX_NE16_H__
diff --git a/ne16/bsp/ne16_pulp_bsp.c b/ne16/bsp/ne16_pulp_bsp.c
new file mode 100644
index 0000000..a170720
--- /dev/null
+++ b/ne16/bsp/ne16_pulp_bsp.c
@@ -0,0 +1,106 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "ne16_pulp_bsp.h"
+#include <pmsis.h>
+
+#define NE16_PULP_CLUSTER_CTRL_ADDR_BASE (0x00200000)
+#define NE16_PULP_CLUSTER_CTRL_HWPE_OFFS 0x18
+#define NE16_PULP_CLUSTER_CTRL_HWPE_ADDR                                       \
+  (NE16_PULP_CLUSTER_CTRL_ADDR_BASE + NE16_PULP_CLUSTER_CTRL_HWPE_OFFS)
+#define NE16_PULP_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800
+#define NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
+#define NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
+#define NE16_PULP_MAX_STALL (8)
+#define NE16_PULP_EVENT (1 << 12)
+#define NE16_PULP_BASE_ADDR (0x00201000)
+
+/* Set the CG_EN bit of the cluster control HWPE register. */
+void ne16_pulp_cg_enable(void) {
+  *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR |=
+      NE16_PULP_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+/* Clear the CG_EN bit of the cluster control HWPE register. */
+void ne16_pulp_cg_disable(void) {
+  *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NE16_PULP_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+/* Set the HCI_PRIO bit of the cluster control HWPE register. */
+void ne16_pulp_hci_setpriority_ne16(void) {
+  *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR |=
+      NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+/* Clear the HCI_PRIO bit of the cluster control HWPE register. */
+void ne16_pulp_hci_setpriority_core(void) {
+  *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+/* Clear the HCI_MAXSTALL field of the cluster control HWPE register. */
+void ne16_pulp_hci_reset_max_stall(void) {
+  *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+/*
+ * Program the HCI_MAXSTALL field with max_stall (masked to the field width).
+ * The field is cleared first so that bits left over from a previously
+ * configured value are not OR-ed into the new one.
+ */
+void ne16_pulp_hci_set_max_stall(uint32_t max_stall) {
+  volatile uint32_t *const hwpe_ctrl =
+      (volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR;
+  const uint32_t maxstall_mask = NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+
+  *hwpe_ctrl = (*hwpe_ctrl & ~maxstall_mask) | (max_stall & maxstall_mask);
+}
+
+/*
+ * Set up the cluster for the NE16: set CG_EN, set the HCI priority bit and
+ * program the maxstall value taken from conf.
+ */
+void ne16_pulp_open(ne16_pulp_conf_t *conf) {
+  ne16_pulp_cg_enable();
+  ne16_pulp_hci_setpriority_ne16();
+  ne16_pulp_hci_set_max_stall(conf->max_stall);
+}
+
+/* Undo ne16_pulp_open in reverse order: maxstall, HCI priority, CG_EN. */
+void ne16_pulp_close(void) {
+  ne16_pulp_hci_reset_max_stall();
+  ne16_pulp_hci_setpriority_core();
+  ne16_pulp_cg_disable();
+}
+
+/* Wait for NE16_PULP_EVENT and clear it via eu_evt_maskWaitAndClr. */
+void ne16_pulp_event_wait_and_clear(void) {
+  eu_evt_maskWaitAndClr(NE16_PULP_EVENT);
+}
+
+/* The NE16 device instance of this board; only its base address is set. */
+static const ne16_dev_t ne16_pulp_dev = {
+    .hwpe_dev = (struct hwpe_dev_t){
+        .base_addr = (volatile uint32_t *)NE16_PULP_BASE_ADDR}};
+
+/* Return a pointer to the board's constant NE16 device instance. */
+const ne16_dev_t *ne16_pulp_get_dev(void) { return &ne16_pulp_dev; }
diff --git a/ne16/bsp/ne16_pulp_bsp.h b/ne16/bsp/ne16_pulp_bsp.h
new file mode 100644
index 0000000..8f1bc0a
--- /dev/null
+++ b/ne16/bsp/ne16_pulp_bsp.h
@@ -0,0 +1,81 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NE16_PULP_BSP_H__
+#define __NE16_PULP_BSP_H__
+
+#include "ne16.h"
+#include <stdint.h>
+
+/**
+ * ne16_pulp_cg_enable
+ *
+ * Enable clock gating of the NE16.
+ */
+void ne16_pulp_cg_enable(void);
+
+/**
+ * ne16_pulp_cg_disable
+ *
+ * Disable clock gating of the NE16.
+ */
+void ne16_pulp_cg_disable(void);
+
+/**
+ * ne16_pulp_hci_setpriority_ne16
+ *
+ * Set HCI interconnect bus priority to prioritize NE16.
+ */
+void ne16_pulp_hci_setpriority_ne16(void);
+
+/**
+ * ne16_pulp_hci_setpriority_core
+ *
+ * Set HCI bus priority to prioritize cores.
+ */
+void ne16_pulp_hci_setpriority_core(void);
+
+/**
+ * ne16_pulp_hci_reset_max_stall
+ *
+ * Reset the HCI bus maxstall parameter.
+ * TODO: Check if it disables it also or just resets?
+ */
+void ne16_pulp_hci_reset_max_stall(void);
+
+/**
+ * ne16_pulp_hci_set_max_stall
+ *
+ * Set the HCI bus maxstall. Maxstall defines how many cycles
+ * the HCI bus will stall the lower priority master, i.e. ne16 or core,
+ * before letting it do a transaction.
+ */
+void ne16_pulp_hci_set_max_stall(uint32_t max_stall);
+
+typedef struct ne16_pulp_conf_t {
+  int max_stall; /* HCI maxstall programmed by ne16_pulp_open */
+} ne16_pulp_conf_t;
+
+void ne16_pulp_open(ne16_pulp_conf_t *conf);
+void ne16_pulp_close(void);
+void ne16_pulp_event_wait_and_clear(void);
+const ne16_dev_t *ne16_pulp_get_dev(void);
+
+#endif // !__NE16_PULP_BSP_H__
diff --git a/ne16/gvsoc/ne16_gvsoc_logging.h b/ne16/gvsoc/ne16_gvsoc.h
similarity index 51%
rename from ne16/gvsoc/ne16_gvsoc_logging.h
rename to ne16/gvsoc/ne16_gvsoc.h
index 19db8b5..f6626fd 100644
--- a/ne16/gvsoc/ne16_gvsoc_logging.h
+++ b/ne16/gvsoc/ne16_gvsoc.h
@@ -18,15 +18,19 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#ifndef __NE16_GVSOC_LOGGING_H__
-#define __NE16_GVSOC_LOGGING_H__
+#ifndef __NE16_GVSOC_H__
+#define __NE16_GVSOC_H__
 
-#include "ne16_hal.h"
+#include "ne16.h"
+#include "ne16_task.h"
 
-typedef enum ne16_gvsoc_logging_format_e {
-  NE16_GVSOC_LOGGING_FORMAT_DECIMAL = 0,
-  NE16_GVSOC_LOGGING_FORMAT_HEXADECIMAL = 3
-} ne16_gvsoc_logging_format_e;
+#define NE16_REG_GVSOC_LOG_LEVEL 24
+#define NE16_REG_GVSOC_LOG_FORMAT 25
+
+typedef enum ne16_gvsoc_log_format_e {
+  NE16_GVSOC_LOG_FORMAT_DECIMAL = 0,
+  NE16_GVSOC_LOG_FORMAT_HEXADECIMAL = 3
+} ne16_gvsoc_log_format_e;
 
 typedef enum ne16_gvsoc_log_level_e {
   NE16_GVSOC_LOG_LEVEL_CONFIG = 0,
@@ -35,15 +39,22 @@ typedef enum ne16_gvsoc_log_level_e {
   NE16_GVSOC_LOG_LEVEL_ALL = 3
 } ne16_gvsoc_log_level_e;
 
-static inline void
-ne16_activate_gvsoc_logging(ne16_gvsoc_log_level_e log_level,
-                            ne16_gvsoc_logging_format_e format) {
-  NE16_WRITE_IO_REG(sizeof(nnx_task_data_t), log_level);
-  NE16_WRITE_IO_REG(sizeof(nnx_task_data_t) + 4, format);
+/*
+ * Write log_level and format to the GVSoC log level/format task registers.
+ * static inline: defined in a header, so translation units that include it
+ * without calling it must not get an unused-function warning.
+ */
+static inline void ne16_gvsoc_log_activate(ne16_dev_t *dev,
+                                           ne16_gvsoc_log_level_e log_level,
+                                           ne16_gvsoc_log_format_e format) {
+  hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL, log_level);
+  hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_FORMAT, format);
 }
 
-static inline void ne16_deactivate_gvsoc_logging() {
-  NE16_WRITE_IO_REG(sizeof(nnx_task_data_t), 0);
+/* Write NE16_GVSOC_LOG_LEVEL_CONFIG back to the GVSoC log level register. */
+static inline void ne16_gvsoc_log_deactivate(ne16_dev_t *dev) {
+  hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL,
+                      NE16_GVSOC_LOG_LEVEL_CONFIG);
 }
 
-#endif // __NE16_GVSOC_LOGGING_H__
+#endif // __NE16_GVSOC_H__
diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c
new file mode 100644
index 0000000..97859b4
--- /dev/null
+++ b/ne16/hal/ne16.c
@@ -0,0 +1,48 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "ne16.h"
+
+#define NE16_STATUS_EMPTY (0x000)
+#define NE16_STATUS_FULL (0x101)
+
+/* Size of the task queue; a constant 2 that does not depend on dev. */
+inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; }
+
+/*
+ * Count the tasks in flight as the sum of bit 0 and bit 8 of the HWPE task
+ * queue status word.
+ */
+inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) {
+  const uint32_t queue_status = hwpe_task_queue_status(&dev->hwpe_dev);
+  const uint32_t bit0 = queue_status & 0x1;
+  const uint32_t bit8 = (queue_status >> 8) & 0x1;
+  return bit0 + bit8;
+}
+
+/* Non-zero when the task queue status equals NE16_STATUS_EMPTY. */
+inline int ne16_task_queue_empty(ne16_dev_t *dev) {
+  return NE16_STATUS_EMPTY == hwpe_task_queue_status(&dev->hwpe_dev);
+}
+
+/* Non-zero when the task queue status equals NE16_STATUS_FULL. */
+inline int ne16_task_queue_full(ne16_dev_t *dev) {
+  return NE16_STATUS_FULL == hwpe_task_queue_status(&dev->hwpe_dev);
+}
diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h
new file mode 100644
index 0000000..c4c3a19
--- /dev/null
+++ b/ne16/hal/ne16.h
@@ -0,0 +1,36 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NE16_H__
+#define __NE16_H__
+
+#include "hwpe.h"
+#include <stdint.h>
+
+typedef struct ne16_dev_t {
+  hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
+} ne16_dev_t;
+
+int ne16_task_queue_size(ne16_dev_t *dev);
+int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev);
+int ne16_task_queue_empty(ne16_dev_t *dev);
+int ne16_task_queue_full(ne16_dev_t *dev);
+
+#endif // __NE16_H__
diff --git a/ne16/hal/ne16_defs.h b/ne16/hal/ne16_defs.h
deleted file mode 100644
index 7aeb993..0000000
--- a/ne16/hal/ne16_defs.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NE16_DEFS_H__
-#define __NE16_DEFS_H__
-
-/* ARHITECTURE */
-
-#define NE16_FILTER_SIZE (3)
-#define NE16_FILTER_BUFFER_SIZE (5)
-#define NE16_INPUT_CHANNEL_THROUGHPUT (16)
-#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32)
-#define NE16_CONTEXT_SIZE (2)
-
-#define NE16_WEIGHT_D0_STRIDE_MODE8 (2)
-#define NE16_WEIGHT_D0_STRIDE_MODE16 (1)
-
-/* REGISTER MAP */
-
-#define NE16_EVT0 (1 << 12)
-#define NE16_EVT1 (1 << 13)
-#define NE16_BASE_ADDR (0x00201000)
-
-/* CLUSTER */
-
-#define CLUSTER_CTRL_ADDR_BASE (0x00200000)
-
-/* CLUSTER_HWPE */
-
-#define CLUSTER_CTRL_HWPE_OFFS 0x18
-
-#define CLUSTER_CTRL_HWPE_ADDR (CLUSTER_CTRL_ADDR_BASE + CLUSTER_CTRL_HWPE_OFFS)
-
-#define CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800
-#define CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
-#define CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
-
-/* REGISTER OFFSETS */
-
-// commands
-#define NE16_TRIGGER 0x00
-#define NE16_ACQUIRE 0x04
-#define NE16_FINISHED 0x08
-#define NE16_STATUS 0x0C
-#define NE16_RUNNING_JOB 0x10
-#define NE16_SOFT_CLEAR 0x14
-#define NE16_SWSYNC 0x18
-#define NE16_URISCY_IMEM 0x1C
-
-// job configuration
-#define NE16_REGISTER_OFFSET 0x20
-
-#define NE16_REG_WEIGHTS_PTR 0x00
-#define NE16_REG_INFEAT_PTR 0x04
-#define NE16_REG_OUTFEAT_PTR 0x08
-#define NE16_REG_SCALE_PTR 0x0C
-#define NE16_REG_SCALE_SHIFT_PTR 0x10
-#define NE16_REG_SCALE_BIAS_PTR 0x14
-#define NE16_REG_INFEAT_D0_STRIDE 0x18
-#define NE16_REG_INFEAT_D1_STRIDE 0x1C
-#define NE16_REG_INFEAT_D2_STRIDE 0x20
-#define NE16_REG_OUTFEAT_D0_STRIDE 0x24
-#define NE16_REG_OUTFEAT_D1_STRIDE 0x28
-#define NE16_REG_OUTFEAT_D2_STRIDE 0x2C
-#define NE16_REG_WEIGHTS_D0_STRIDE 0x30
-#define NE16_REG_WEIGHTS_D1_STRIDE 0x34
-#define NE16_REG_WEIGHTS_D2_STRIDE 0x38
-#define NE16_REG_SUBTILE_REMAINDER_0 0x3C
-#define NE16_REG_SUBTILE_REMAINDER_1 0x40
-#define NE16_REG_SUBTILE_REMAINDER_2 0x44
-#define NE16_REG_SUBTILE_NUMBER_0 0x48
-#define NE16_REG_SUBTILE_NUMBER_1 0x4C
-#define NE16_REG_PADDING 0x50
-#define NE16_REG_WEIGHT_OFFSET_FACTOR 0x54
-#define NE16_REG_FILTER_MASKING 0x58
-#define NE16_REG_CONF0 0x5C
-
-/*  SHIFT  */
-
-#define NE16_SHIFT_FLAG_NORM_BIAS (25)
-#define NE16_SHIFT_FLAG_NORM_SHIFT (24)
-#define NE16_SHIFT_ROUNDING (11)
-
-/*  CONF0 FLAGS */
-
-#define NE16_FLAG_NORM_BIAS (1 << 25)
-#define NE16_FLAG_NORM_SHIFT (1 << 24)
-#define NE16_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
-#define NE16_FLAG_QUANT_FUNCTION_RELU (0 << 23)
-#define NE16_QUANT_MODE_8BIT (0 << 21)
-#define NE16_QUANT_MODE_16BIT (1 << 21)
-#define NE16_QUANT_MODE_32BIT (2 << 21)
-// conf0[20:16] - quantization shift amount
-#define NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
-#define NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
-#define NE16_FLAG_STREAMIN (1 << 14)
-#define NE16_NORM_MODE_8BIT (0 << 12)
-#define NE16_NORM_MODE_16BIT (1 << 12)
-#define NE16_NORM_MODE_32BIT (2 << 12)
-#define NE16_FLAG_ROUND (1 << 11)
-#define NE16_FLAG_STRIDE_2x2 (1 << 8)
-#define NE16_FLAG_LINEAR_MODE (1 << 7)
-#define NE16_FLAG_MODE_3x3 (0 << 5)
-#define NE16_FLAG_MODE_3x3_DW (1 << 5)
-#define NE16_FLAG_MODE_1x1 (2 << 5)
-#define NE16_FLAG_NORM_QUANT (1 << 4)
-#define NE16_FLAG_MODE_BASIC (0 << 3)
-#define NE16_FLAG_MODE16 (1 << 3)
-
-/* Masks */
-
-#define NE16_MASK_QUANT_FUNCTION (1 << 23)
-#define NE16_MASK_QUANT_MODE (3 << 21)
-
-/* PADDING */
-
-#define NE16_DONT_PAD (0)
-#define NE16_MAX_PAD (2)
-
-/* NORM */
-#define NE16_NORM_MAX_LEN (32)
-#define NE16_NO_NORM(length)                                                   \
-  {                                                                            \
-    .scale = scale_identity, .bias = NE16_NULL, .shift = NE16_NULL,            \
-    .length = length, .mode = normMode32Bit                                    \
-  }
-
-/* QUANT */
-#define NE16_NO_QUANT                                                          \
-  {                                                                            \
-    .shift_amount = 0, .mode = quantMode32Bit,                                 \
-    .function = quantFunctionIdentity                                          \
-  }
-
-/* NULL */
-#define NE16_NULL ((void *)0)
-
-#define NE16_STATUS_FULL (0x101)
-
-#endif // __NE16_DEFS_H__
diff --git a/ne16/hal/ne16_hal.h b/ne16/hal/ne16_hal.h
deleted file mode 100644
index 1bc460f..0000000
--- a/ne16/hal/ne16_hal.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Luka Macan <luka.macan@unibo.it>
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NE16_HAL_H__
-#define __NE16_HAL_H__
-
-#include "ne16_defs.h"
-#include <stdint.h>
-
-#define NE16_WRITE(offset, value)                                              \
-  *(int volatile *)(NE16_BASE_ADDR + (offset)) = (value)
-#define NE16_READ(offset) *(int volatile *)(NE16_BASE_ADDR + (offset))
-
-#define NE16_WRITE_IO_REG(offset, value)                                       \
-  NE16_WRITE(NE16_REGISTER_OFFSET + (offset), (value))
-#define NE16_READ_IO_REG(offset) NE16_READ(NE16_REGISTER_OFFSET + (offset))
-
-#define NE16_FLAG_USED (1)
-#define NE16_FLAG_UNUSED (0)
-
-typedef enum nnx_weight_offset_mode_e {
-  weightOffsetModeSymmetric = NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC,
-  weightOffsetModeLayerWise = NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE
-} nnx_weight_offset_mode_e;
-
-typedef enum {
-  normMode8Bit = NE16_NORM_MODE_8BIT,
-  normMode16Bit = NE16_NORM_MODE_16BIT,
-  normMode32Bit = NE16_NORM_MODE_32BIT
-} nnx_norm_mode_e;
-
-typedef struct nnx_norm_t {
-  nnx_norm_mode_e mode;
-  int flag_bias;
-  int flag_shift;
-} nnx_norm_t;
-
-typedef enum nnx_quant_mode_e {
-  quantMode8Bit = NE16_QUANT_MODE_8BIT,
-  quantMode16Bit = NE16_QUANT_MODE_16BIT,
-  quantMode32Bit = NE16_QUANT_MODE_32BIT
-} nnx_quant_mode_e;
-
-typedef enum nnx_quant_function_e {
-  quantFunctionIdentity = NE16_FLAG_QUANT_FUNCTION_IDENTITY,
-  quantFunctionRelu = NE16_FLAG_QUANT_FUNCTION_RELU
-} nnx_quant_function_e;
-
-typedef struct nnx_quant_t {
-  // Shift amount must be in range 0x00-0x1F
-  unsigned shift_amount;
-  nnx_quant_mode_e mode;
-  nnx_quant_function_e function;
-  int flag_rounding;
-} nnx_quant_t;
-
-typedef struct nnx_stride_t {
-  uint32_t d0;
-  uint32_t d1;
-  uint32_t d2;
-} nnx_stride_t;
-
-typedef struct nnx_subtile_remainder_t {
-  uint32_t KoKi;
-  uint32_t HoWo;
-  uint32_t HiWi;
-} nnx_subtile_remainder_t;
-
-typedef struct nnx_subtile_number_t {
-  uint32_t KoKi;
-  uint32_t HoWo;
-} nnx_subtile_number_t;
-
-typedef struct nnx_subtile_t {
-  nnx_subtile_remainder_t remainder;
-  nnx_subtile_number_t number;
-} nnx_subtile_t;
-
-typedef struct nnx_cfg_t {
-  nnx_stride_t input_stride;
-  nnx_stride_t output_stride;
-  nnx_stride_t weights_stride;
-  nnx_subtile_t subtile;
-  uint32_t padding;
-  uint32_t weight_offset_factor;
-  uint32_t filter_mask;
-  uint32_t conf0;
-} nnx_cfg_t;
-
-typedef struct nnx_task_data_t {
-  uint32_t weights_ptr;
-  uint32_t infeat_ptr;
-  uint32_t outfeat_ptr;
-  uint32_t scale_ptr;
-  uint32_t scale_shift_ptr;
-  uint32_t scale_bias_ptr;
-  nnx_cfg_t cfg;
-} nnx_task_data_t;
-
-typedef struct nnx_task_t {
-  nnx_task_data_t data;
-  uint8_t outbytes;
-  uint8_t weight_d0_stride;
-  uint8_t qw;
-  uint8_t stride_shift;
-  uint8_t output_channel_throughput;
-  uint8_t kernel_shape;
-  uint8_t depthwise;
-  uint8_t id;
-} nnx_task_t;
-
-void ne16_cg_enable();
-void ne16_cg_disable();
-
-/**
- * ne16_setpriority_ne16
- *
- * Set HCI interconnect bus priority to prioritize NE16.
- */
-void ne16_setpriority_ne16();
-
-/**
- * ne16_setpriority_core
- *
- * Set HCI bus priority to prioritize cores.
- */
-void ne16_setpriority_core();
-
-/**
- * ne16_reset_maxstall
- *
- * Reset the HCI bus maxstall parameter.
- * TODO: Check if it disables it also or just resets?
- */
-void ne16_reset_max_stall();
-
-/**
- * ne16_set_maxstall
- *
- * Set the HCI bus maxstall. Maxstall defines how many cycles
- * will the HCI bus stall the lower priority master, i.e. ne16 or core,
- * before letting it do a transaction.
- */
-void ne16_set_max_stall(uint32_t max_stall);
-void ne16_soft_clear();
-int ne16_empty();
-int ne16_full();
-uint8_t ne16_last_task_id();
-void ne16_event_wait();
-uint8_t ne16_acquire();
-void ne16_run_async();
-void ne16_commit();
-uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
-                               uint32_t i_width, uint32_t n_height,
-                               uint32_t n_width);
-
-void ne16_task_init(nnx_task_t *task, const uint8_t kernel_shape,
-                    const uint8_t depthwise, const uint8_t input_bits,
-                    const uint8_t output_bits, const uint8_t weights_bits,
-                    const nnx_weight_offset_mode_e weights_offset_mode,
-                    const uint32_t weights_offset_factor, nnx_quant_t quant,
-                    nnx_norm_t norm, const uint8_t stride);
-void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in,
-                           const uint32_t w_in_stride,
-                           const uint32_t k_in_stride,
-                           const uint32_t w_out_stride,
-                           const uint32_t k_out_stride);
-void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in,
-                            const uint32_t h_out, const uint32_t w_out,
-                            const uint32_t k_out, const uint8_t padding_bottom,
-                            const uint8_t padding_right);
-void ne16_task_set_padding(nnx_task_t *task, const uint8_t top,
-                           const uint8_t bottom, const uint8_t left,
-                           const uint8_t right, const uint8_t value);
-void ne16_task_set_mask_filter(nnx_task_t *task, const uint8_t top,
-                               const uint8_t right, const uint8_t bottom,
-                               const uint8_t left);
-void ne16_task_offload(nnx_task_t *task);
-
-#endif // __NE16_HAL_H__
diff --git a/ne16/hal/ne16_hal.c b/ne16/hal/ne16_task.c
similarity index 61%
rename from ne16/hal/ne16_hal.c
rename to ne16/hal/ne16_task.c
index 42b076c..0ba54d5 100644
--- a/ne16/hal/ne16_hal.c
+++ b/ne16/hal/ne16_task.c
@@ -18,63 +18,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <stdint.h>
-
-#include "ne16_defs.h"
-#include "ne16_hal.h"
-#include "pmsis.h"
+#include "ne16_task.h"
+#include "ne16_task_defs.h"
 #include "pulp_nnx_util.h"
 
-inline void ne16_cg_enable() {
-  *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR |= CLUSTER_CTRL_HWPE_MASK_CG_EN;
-}
-
-inline void ne16_cg_disable() {
-  *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR &= ~CLUSTER_CTRL_HWPE_MASK_CG_EN;
-}
-
-inline void ne16_setpriority_ne16() {
-  *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR |=
-      CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
-}
-
-inline void ne16_setpriority_core() {
-  *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR &=
-      ~CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
-}
-
-inline void ne16_reset_max_stall() {
-  *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR &=
-      ~CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
-}
-
-inline void ne16_set_max_stall(uint32_t max_stall) {
-  *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR |=
-      max_stall & CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
-}
-
-inline void ne16_soft_clear() {
-  NE16_WRITE(NE16_SOFT_CLEAR, 0);
-  for (volatile int i = 0; i < 10; i++)
-    ;
-}
-
-inline int ne16_empty() { return NE16_READ(NE16_STATUS) == 0; }
-
-inline int ne16_full() { return NE16_READ(NE16_STATUS) == NE16_STATUS_FULL; }
-
-inline uint8_t ne16_last_task_id() { return NE16_READ(NE16_RUNNING_JOB); }
-
-inline void ne16_event_wait() { eu_evt_maskWaitAndClr(NE16_EVT0); }
-
-inline uint8_t ne16_acquire() { return NE16_READ(NE16_ACQUIRE); }
-
-inline void ne16_run_async() { NE16_WRITE(NE16_TRIGGER, 0); }
-
-inline void ne16_commit() {
-  NE16_WRITE(NE16_TRIGGER, 1); // commit, no trigger
-}
-
 inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
                                       uint32_t i_width, uint32_t n_height,
                                       uint32_t n_width) {
@@ -94,16 +41,16 @@ inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
   return tile_padding;
 }
 
-void ne16_task_init(nnx_task_t *task, const uint8_t kernel_shape,
+void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
                     const uint8_t depthwise, const uint8_t input_bits,
                     const uint8_t output_bits, const uint8_t weights_bits,
-                    const nnx_weight_offset_mode_e weights_offset_mode,
-                    const uint32_t weights_offset_factor, nnx_quant_t quant,
-                    nnx_norm_t norm, const uint8_t stride) {
+                    const ne16_weight_offset_mode_e weights_offset_mode,
+                    const uint32_t weights_offset_factor, ne16_quant_t quant,
+                    ne16_norm_t norm, const uint8_t stride) {
   const uint32_t flag_mode16 =
       input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC;
 
-  *task = (nnx_task_t){
+  *task = (ne16_task_t){
       .outbytes = output_bits / 8,
       .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16
                                       : NE16_WEIGHT_D0_STRIDE_MODE8,
@@ -131,14 +78,42 @@ void ne16_task_init(nnx_task_t *task, const uint8_t kernel_shape,
   task->data.cfg.weight_offset_factor = weights_offset_factor;
 }
 
-void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in,
+/** ne16_pad_ptr
+ *
+ * Move the pointer back by the top and left padding so that it
+ * points to where the padded data would start.
+ * Necessary for the input pointer when the input is padded.
+ */
+inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
+                             const uint32_t channel, const uint8_t bits,
+                             const uint8_t padding_top,
+                             const uint8_t padding_left) {
+  return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+}
+
+inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr,
+                               uint32_t w_in, uint32_t k_in, uint8_t bits_in,
+                               uint8_t padding_top, uint8_t padding_left,
+                               uint32_t output_ptr, uint32_t weights_ptr,
+                               uint32_t scale_ptr, uint32_t shift_ptr,
+                               uint32_t bias_ptr) {
+  task->data.infeat_ptr =
+      ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+  task->data.outfeat_ptr = output_ptr;
+  task->data.weights_ptr = weights_ptr;
+  task->data.scale_ptr = scale_ptr;
+  task->data.scale_shift_ptr = shift_ptr;
+  task->data.scale_bias_ptr = bias_ptr;
+}
+
+void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
                            const uint32_t w_in_stride,
                            const uint32_t k_in_stride,
                            const uint32_t w_out_stride,
                            const uint32_t k_out_stride) {
   const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
 
-  const nnx_stride_t input_stride = {
+  const ne16_stride_t input_stride = {
       .d0 = k_in_stride,
       .d1 = k_in_stride * w_in_stride,
       .d2 = task->depthwise ? 0
@@ -147,7 +122,7 @@ void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in,
   task->data.cfg.input_stride = input_stride;
 
   // WARNING: Stride works only for even output channel sizes (divisible by 2)
-  const nnx_stride_t output_stride = {
+  const ne16_stride_t output_stride = {
       .d0 = 32,
       .d1 = (k_out_stride * task->outbytes) >> task->stride_shift,
       .d2 =
@@ -174,7 +149,7 @@ void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in,
   }
 }
 
-void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in,
+void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
                             const uint32_t h_out, const uint32_t w_out,
                             const uint32_t k_out, const uint8_t padding_bottom,
                             const uint8_t padding_right) {
@@ -192,7 +167,7 @@ void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in,
   const uint16_t rem_Wi =
       (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
 
-  const nnx_subtile_t subtile = {
+  const ne16_subtile_t subtile = {
       .number = {.KoKi = concat_half(num_Ko, num_Ki),
                  .HoWo = concat_half(num_Ho, num_Wo)},
       .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
@@ -201,7 +176,7 @@ void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in,
   task->data.cfg.subtile = subtile;
 }
 
-inline void ne16_task_set_padding(nnx_task_t *task, const uint8_t top,
+inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
                                   const uint8_t bottom, const uint8_t left,
                                   const uint8_t right, const uint8_t value) {
   task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) |
@@ -209,16 +184,49 @@ inline void ne16_task_set_padding(nnx_task_t *task, const uint8_t top,
                            (value & 0xff);
 }
 
-inline void ne16_task_set_mask_filter(nnx_task_t *task, const uint8_t top,
+inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
                                       const uint8_t right, const uint8_t bottom,
                                       const uint8_t left) {
   task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
                                ((bottom & 0xff) << 8) | ((left & 0xff) << 0);
 }
 
-inline void ne16_task_offload(nnx_task_t *task) {
-  uint32_t *task_data = (uint32_t *)&task->data;
-  for (int i = 0; i < sizeof(nnx_task_data_t) / 4; ++i) {
-    NE16_WRITE_IO_REG(i * 4, task_data[i]);
-  }
+void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
+                        const uint32_t k_in, const uint32_t w_in_stride,
+                        const uint32_t k_in_stride, const uint32_t h_out,
+                        const uint32_t w_out, const uint32_t k_out,
+                        const uint32_t w_out_stride, const uint32_t k_out_stride,
+                        const uint8_t padding_top, const uint8_t padding_bottom,
+                        const uint8_t padding_right,
+                        const uint8_t padding_left) {
+  ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                        k_out_stride);
+  ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
+                         padding_right);
+  ne16_task_set_padding(task, padding_top, padding_bottom, padding_left,
+                        padding_right, 0);
+}
+
+void ne16_task_set_dims_stride2x2(
+    ne16_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left) {
+  const uint8_t stride = 2;
+
+  ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                        k_out_stride);
+  ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
+                         k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0);
+
+  const uint8_t padding_bottom_new =
+      (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
+  const uint8_t padding_right_new =
+      (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right;
+
+  ne16_task_set_padding(task, padding_top, padding_bottom_new, padding_left,
+                        padding_right_new, 0);
 }
diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h
new file mode 100644
index 0000000..df16b6c
--- /dev/null
+++ b/ne16/hal/ne16_task.h
@@ -0,0 +1,172 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NE16_TASK_H__
+#define __NE16_TASK_H__
+
+#include "ne16_task_defs.h"
+#include <stdint.h>
+
+typedef enum ne16_task_flag_e {
+  ne16TaskFlagFalse = 0,
+  ne16TaskFlagTrue = 1
+} ne16_task_flag_e;
+
+typedef enum ne16_weight_offset_mode_e {
+  weightOffsetModeSymmetric = NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC,
+  weightOffsetModeLayerWise = NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE
+} ne16_weight_offset_mode_e;
+
+typedef enum {
+  normMode8Bit = NE16_NORM_MODE_8BIT,
+  normMode16Bit = NE16_NORM_MODE_16BIT,
+  normMode32Bit = NE16_NORM_MODE_32BIT
+} ne16_norm_mode_e;
+
+typedef struct ne16_norm_t {
+  ne16_norm_mode_e mode;
+  int flag_bias;
+  int flag_shift;
+} ne16_norm_t;
+
+typedef enum ne16_quant_mode_e {
+  quantMode8Bit = NE16_QUANT_MODE_8BIT,
+  quantMode16Bit = NE16_QUANT_MODE_16BIT,
+  quantMode32Bit = NE16_QUANT_MODE_32BIT
+} ne16_quant_mode_e;
+
+typedef enum ne16_quant_function_e {
+  quantFunctionIdentity = NE16_FLAG_QUANT_FUNCTION_IDENTITY,
+  quantFunctionRelu = NE16_FLAG_QUANT_FUNCTION_RELU
+} ne16_quant_function_e;
+
+typedef struct ne16_quant_t {
+  // Shift amount must be in range 0x00-0x1F
+  unsigned shift_amount;
+  ne16_quant_mode_e mode;
+  ne16_quant_function_e function;
+  int flag_rounding;
+} ne16_quant_t;
+
+typedef struct ne16_stride_t {
+  uint32_t d0;
+  uint32_t d1;
+  uint32_t d2;
+} ne16_stride_t;
+
+typedef struct ne16_subtile_remainder_t {
+  uint32_t KoKi;
+  uint32_t HoWo;
+  uint32_t HiWi;
+} ne16_subtile_remainder_t;
+
+typedef struct ne16_subtile_number_t {
+  uint32_t KoKi;
+  uint32_t HoWo;
+} ne16_subtile_number_t;
+
+typedef struct ne16_subtile_t {
+  ne16_subtile_remainder_t remainder;
+  ne16_subtile_number_t number;
+} ne16_subtile_t;
+
+typedef struct ne16_cfg_t {
+  ne16_stride_t input_stride;
+  ne16_stride_t output_stride;
+  ne16_stride_t weights_stride;
+  ne16_subtile_t subtile;
+  uint32_t padding;
+  uint32_t weight_offset_factor;
+  uint32_t filter_mask;
+  uint32_t conf0;
+} ne16_cfg_t;
+
+typedef struct ne16_task_data_t {
+  uint32_t weights_ptr;
+  uint32_t infeat_ptr;
+  uint32_t outfeat_ptr;
+  uint32_t scale_ptr;
+  uint32_t scale_shift_ptr;
+  uint32_t scale_bias_ptr;
+  ne16_cfg_t cfg;
+} ne16_task_data_t;
+
+typedef struct ne16_task_t {
+  ne16_task_data_t data;
+  uint8_t outbytes;
+  uint8_t weight_d0_stride;
+  uint8_t qw;
+  uint8_t stride_shift;
+  uint8_t output_channel_throughput;
+  uint8_t kernel_shape;
+  uint8_t depthwise;
+  uint8_t id;
+} ne16_task_t;
+
+void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
+                    const uint8_t depthwise, const uint8_t input_bits,
+                    const uint8_t output_bits, const uint8_t weights_bits,
+                    const ne16_weight_offset_mode_e weights_offset_mode,
+                    const uint32_t weights_offset_factor, ne16_quant_t quant,
+                    ne16_norm_t norm, const uint8_t stride);
+uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
+                               uint32_t i_width, uint32_t n_height,
+                               uint32_t n_width);
+uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
+                      const uint32_t channel, const uint8_t bits,
+                      const uint8_t padding_top, const uint8_t padding_left);
+void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
+                        uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+                        uint8_t padding_left, uint32_t output_ptr,
+                        uint32_t weights_ptr, uint32_t scale_ptr,
+                        uint32_t shift_ptr, uint32_t bias_ptr);
+void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
+                           const uint32_t w_in_stride,
+                           const uint32_t k_in_stride,
+                           const uint32_t w_out_stride,
+                           const uint32_t k_out_stride);
+void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
+                            const uint32_t h_out, const uint32_t w_out,
+                            const uint32_t k_out, const uint8_t padding_bottom,
+                            const uint8_t padding_right);
+void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
+                           const uint8_t bottom, const uint8_t left,
+                           const uint8_t right, const uint8_t value);
+void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
+                               const uint8_t right, const uint8_t bottom,
+                               const uint8_t left);
+void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
+                        const uint32_t k_in, const uint32_t w_in_stride,
+                        const uint32_t k_in_stride, const uint32_t h_out,
+                        const uint32_t w_out, const uint32_t k_out,
+                        const uint32_t w_out_stride, const uint32_t k_out_stride,
+                        const uint8_t padding_top, const uint8_t padding_bottom,
+                        const uint8_t padding_right,
+                        const uint8_t padding_left);
+void ne16_task_set_dims_stride2x2(
+    ne16_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left);
+
+#endif // !__NE16_TASK_H__
diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h
new file mode 100644
index 0000000..803e30e
--- /dev/null
+++ b/ne16/hal/ne16_task_defs.h
@@ -0,0 +1,107 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NE16_DEFS_H__
+#define __NE16_DEFS_H__
+
+/* ARCHITECTURE */
+
+#define NE16_FILTER_SIZE (3)
+#define NE16_FILTER_BUFFER_SIZE (5)
+#define NE16_INPUT_CHANNEL_THROUGHPUT (16)
+#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32)
+
+#define NE16_WEIGHT_D0_STRIDE_MODE8 (2)
+#define NE16_WEIGHT_D0_STRIDE_MODE16 (1)
+
+/* TASK REGISTERS */
+
+// job configuration
+#define NE16_REG_WEIGHTS_PTR 0
+#define NE16_REG_INFEAT_PTR 1
+#define NE16_REG_OUTFEAT_PTR 2
+#define NE16_REG_SCALE_PTR 3
+#define NE16_REG_SCALE_SHIFT_PTR 4
+#define NE16_REG_SCALE_BIAS_PTR 5
+#define NE16_REG_INFEAT_D0_STRIDE 6
+#define NE16_REG_INFEAT_D1_STRIDE 7
+#define NE16_REG_INFEAT_D2_STRIDE 8
+#define NE16_REG_OUTFEAT_D0_STRIDE 9
+#define NE16_REG_OUTFEAT_D1_STRIDE 10
+#define NE16_REG_OUTFEAT_D2_STRIDE 11
+#define NE16_REG_WEIGHTS_D0_STRIDE 12
+#define NE16_REG_WEIGHTS_D1_STRIDE 13
+#define NE16_REG_WEIGHTS_D2_STRIDE 14
+#define NE16_REG_SUBTILE_REMAINDER_0 15
+#define NE16_REG_SUBTILE_REMAINDER_1 16
+#define NE16_REG_SUBTILE_REMAINDER_2 17
+#define NE16_REG_SUBTILE_NUMBER_0 18
+#define NE16_REG_SUBTILE_NUMBER_1 19
+#define NE16_REG_PADDING 20
+#define NE16_REG_WEIGHT_OFFSET_FACTOR 21
+#define NE16_REG_FILTER_MASKING 22
+#define NE16_REG_CONF0 23
+
+/*  SHIFT  */
+
+#define NE16_SHIFT_FLAG_NORM_BIAS (25)
+#define NE16_SHIFT_FLAG_NORM_SHIFT (24)
+#define NE16_SHIFT_ROUNDING (11)
+
+/*  CONF0 FLAGS */
+
+#define NE16_FLAG_NORM_BIAS (1 << 25)
+#define NE16_FLAG_NORM_SHIFT (1 << 24)
+#define NE16_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
+#define NE16_FLAG_QUANT_FUNCTION_RELU (0 << 23)
+#define NE16_QUANT_MODE_8BIT (0 << 21)
+#define NE16_QUANT_MODE_16BIT (1 << 21)
+#define NE16_QUANT_MODE_32BIT (2 << 21)
+// conf0[20:16] - quantization shift amount
+#define NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
+#define NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
+#define NE16_FLAG_STREAMIN (1 << 14)
+#define NE16_NORM_MODE_8BIT (0 << 12)
+#define NE16_NORM_MODE_16BIT (1 << 12)
+#define NE16_NORM_MODE_32BIT (2 << 12)
+#define NE16_FLAG_ROUND (1 << 11)
+#define NE16_FLAG_STRIDE_2x2 (1 << 8)
+#define NE16_FLAG_LINEAR_MODE (1 << 7)
+#define NE16_FLAG_MODE_3x3 (0 << 5)
+#define NE16_FLAG_MODE_3x3_DW (1 << 5)
+#define NE16_FLAG_MODE_1x1 (2 << 5)
+#define NE16_FLAG_NORM_QUANT (1 << 4)
+#define NE16_FLAG_MODE_BASIC (0 << 3)
+#define NE16_FLAG_MODE16 (1 << 3)
+
+/* Masks */
+
+#define NE16_MASK_QUANT_FUNCTION (1 << 23)
+#define NE16_MASK_QUANT_MODE (3 << 21)
+
+/* PADDING */
+
+#define NE16_DONT_PAD (0)
+#define NE16_MAX_PAD (2)
+
+/* NORM */
+#define NE16_NORM_MAX_LEN (32)
+
+#endif // __NE16_DEFS_H__
diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c
index 80752e8..7ab0e99 100644
--- a/src/pulp_nnx_ne16.c
+++ b/src/pulp_nnx_ne16.c
@@ -18,155 +18,63 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "ne16_hal.h"
-#include "pmsis.h"
-#include "pulp_nnx.h"
+#include "pulp_nnx_ne16.h"
+#include "hwpe.h"
+#include "ne16.h"
 #include "pulp_nnx_util.h"
+#include <pmsis.h>
 #include <stdint.h>
+#include <sys/types.h>
 
-inline void nnx_init(uint32_t max_stall) {
-  ne16_cg_enable();
-  ne16_setpriority_ne16();
-  ne16_set_max_stall(max_stall);
-  ne16_soft_clear();
+void ne16_nnx_init(ne16_dev_t *dev, ne16_pulp_conf_t *conf) {
+  ne16_pulp_open(conf);
+  hwpe_soft_clear(&dev->hwpe_dev);
 }
 
-inline void nnx_term() {
-  ne16_soft_clear();
-  ne16_setpriority_core();
-  ne16_reset_max_stall();
-  ne16_cg_disable();
+void ne16_nnx_term(ne16_dev_t *dev) {
+  hwpe_soft_clear(&dev->hwpe_dev);
+  ne16_pulp_close();
 }
 
-/** nnx_dispatch_check
- *
- * Check whether you can dispatch to the accelerator.
- */
-inline int nnx_dispatch_check() { return !ne16_full(); }
+int ne16_nnx_dispatch_check(ne16_dev_t *dev) {
+  return !ne16_task_queue_full(dev);
+}
 
-/** nnx_dispatch_check_blocking
- *
- * Block until you can dispatch to the accelerator.
- */
-inline void nnx_dispatch_check_blocking() {
-  while (!nnx_dispatch_check()) {
-    ne16_event_wait();
+void ne16_nnx_dispatch_wait(ne16_dev_t *dev) {
+  while (!ne16_nnx_dispatch_check(dev)) {
+    ne16_pulp_event_wait_and_clear();
   }
 }
 
-/** nnx_dispatch_task
- *
- * Dispatch a task to the accelerator, assuming it
- * was checked before.
- */
-inline void nnx_dispatch_task(nnx_task_t *task) {
-  task->id = ne16_acquire();
-  ne16_task_offload(task);
-  ne16_run_async();
+int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task) {
+  if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) {
+    return 1;
+  }
+  hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data,
+                             (int)(sizeof(ne16_task_data_t) / 4));
+  hwpe_task_queue_release_and_run(&dev->hwpe_dev);
+  return 0;
 }
 
-/** nnx_resolve_check
- *
- * Check whether the task has been resolved.
- */
-inline int nnx_resolve_check(nnx_task_t *task) {
+int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task) {
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+  // GVSOC model has a broken running_id so resolve_check
+  // conservatively checks whether the task queue is empty.
+  return ne16_task_queue_empty(dev);
+#else
   uint8_t prev_task_id = task->id - 1;
-  return !(ne16_last_task_id() == prev_task_id ||
-           (ne16_last_task_id() == task->id && !ne16_empty()));
+  return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id ||
+           (hwpe_last_task_id(&dev->hwpe_dev) == task->id &&
+            !ne16_task_queue_empty(dev)));
+#endif
 }
 
-/** nnx_resolve_check_blocking
- *
- * Block until you can resolve the task.
- */
-inline void nnx_resolve_check_blocking(nnx_task_t *task) {
-  while (!nnx_resolve_check(task)) {
-    ne16_event_wait();
+void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task) {
+  while (!ne16_nnx_resolve_check(dev, task)) {
+    ne16_pulp_event_wait_and_clear();
   }
 }
 
-inline void nnx_task_init(nnx_task_t *task, const uint8_t kernel_shape,
-                          const uint8_t depthwise, const uint8_t input_bits,
-                          const uint8_t output_bits, const uint8_t weights_bits,
-                          nnx_weight_offset_mode_e weights_offset_mode,
-                          const uint32_t weights_offset_factor,
-                          nnx_quant_t quant, nnx_norm_t norm,
-                          const uint8_t stride) {
-
-  ne16_task_init(task, kernel_shape, depthwise, input_bits, output_bits,
-                 weights_bits, weights_offset_mode, weights_offset_factor,
-                 quant, norm, stride);
-}
-
-/** nnx_pad_ptr
- *
- * Calculate the pointer to the start of the ptr as if
- * it was the start to the padded data.
- * Necessary for input pointer when it's padded.
- */
-inline uint32_t nnx_pad_ptr(uint32_t ptr, const uint32_t width,
-                            const uint32_t channel, const uint8_t bits,
-                            const uint8_t padding_top,
-                            const uint8_t padding_left) {
-  return ptr - (padding_top * width + padding_left) * channel * bits / 8;
-}
-
-inline void nnx_task_set_ptrs(nnx_task_t *task, uint32_t input_ptr,
-                              uint32_t w_in, uint32_t k_in, uint8_t bits_in,
-                              uint8_t padding_top, uint8_t padding_left,
-                              uint32_t output_ptr, uint32_t weights_ptr,
-                              uint32_t scale_ptr, uint32_t shift_ptr,
-                              uint32_t bias_ptr) {
-  task->data.infeat_ptr =
-      nnx_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
-  task->data.outfeat_ptr = output_ptr;
-  task->data.weights_ptr = weights_ptr;
-  task->data.scale_ptr = scale_ptr;
-  task->data.scale_shift_ptr = shift_ptr;
-  task->data.scale_bias_ptr = bias_ptr;
-}
-
-void nnx_task_set_dims(nnx_task_t *task, const uint32_t w_in,
-                       const uint32_t k_in, const uint32_t w_in_stride,
-                       const uint32_t k_in_stride, const uint32_t h_out,
-                       const uint32_t w_out, const uint32_t k_out,
-                       const uint32_t w_out_stride, const uint32_t k_out_stride,
-                       const uint8_t padding_top, const uint8_t padding_bottom,
-                       const uint8_t padding_right,
-                       const uint8_t padding_left) {
-  ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
-                        k_out_stride);
-  ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
-                         padding_right);
-  ne16_task_set_padding(task, padding_top, padding_bottom, padding_left,
-                        padding_right, 0);
-}
-
-void nnx_task_set_dims_stride2x2(
-    nnx_task_t *task, const uint32_t h_in, const uint32_t w_in,
-    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
-    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
-    const uint32_t w_out_stride, const uint32_t k_out_stride,
-    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
-    const uint8_t padding_bottom, const uint8_t padding_right,
-    const uint8_t padding_left) {
-  const uint8_t stride = 2;
-
-  ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
-                        k_out_stride);
-  ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
-                         k_out, h_in + padding_top >= 5 ? 0 : padding_bottom,
-                         0/*w_out > 2 ? 0 : padding_right*/);
-
-  const uint8_t padding_bottom_new =
-      (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
-  const uint8_t padding_right_new =
-      (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right;
-
-  ne16_task_set_padding(task, padding_top, padding_bottom_new, padding_left,
-                        padding_right_new, 0);
-}
-
 static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i,
                                      uint32_t size_j, uint32_t size_k,
                                      uint32_t stride_j, uint32_t stride_k,
@@ -179,15 +87,8 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i,
          (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8;
 }
 
-/** nnx_dispatch_task_stride2x2
- *
- * It uses NE16's 2x2 strided mode which reduces the number of writes NE16 does.
- * This mode doesn't stride the NE16's subtile input pointer, so we have to
- * tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
- * Works only if the k_out is divisible by 2.
- */
-void nnx_dispatch_task_stride2x2(
-    nnx_task_t *task, const uint32_t w_in, const uint32_t k_in,
+void ne16_nnx_dispatch_stride2x2(
+    ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
     const uint32_t w_in_stride, const uint32_t k_in_stride,
     const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
     const uint32_t w_out_stride, const uint32_t k_out_stride,
@@ -221,8 +122,10 @@ void nnx_dispatch_task_stride2x2(
       task->data.cfg.padding =
           ne16_get_tile_padding(tile_padding, i, j, n_h, n_w);
 
-      nnx_dispatch_check_blocking();
-      nnx_dispatch_task(task);
+      // Retry the dispatch, waiting for an event while no task can be acquired
+      while (ne16_nnx_dispatch(dev, task)) {
+        ne16_pulp_event_wait_and_clear();
+      }
     }
   }
 }
diff --git a/test/.gitignore b/test/.gitignore
index 3fc5b6a..50e5358 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -2,7 +2,7 @@ BUILD
 __pycache__
 .cache
 .pytest_cache
-app/gen_inc
+app/gen
 **/compile_commands.json
 **/*.log
 **/*.pt
diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py
index eff0d01..5abb204 100644
--- a/test/HeaderWriter.py
+++ b/test/HeaderWriter.py
@@ -20,8 +20,11 @@
 
 
 class HeaderWriter:
-    def __init__(self, incdir, tabwidth=4):
-        self.incdir = incdir
+    def __init__(self, gendir, tabwidth=4):
+        self.incdir = os.path.join(gendir, "inc")
+        os.makedirs(self.incdir, exist_ok=True)
+        self.srcdir = os.path.join(gendir, "src")
+        os.makedirs(self.srcdir, exist_ok=True)
         self.tabwidth = tabwidth
 
     def header_guard_begin(self, filename):
@@ -60,7 +63,7 @@ def vector_size(self, data):
     def vector_declaration(self, name, size, _type):
         retval = ""
         retval += self.define(f"{name}_size", size)
-        retval += f"PI_L1 {_type} {name}[{name.upper()}_SIZE]"
+        retval += f"{_type} {name}[{name.upper()}_SIZE]"
         return retval
 
     def vector_initial_value(self, data, elements_per_row=10):
@@ -92,8 +95,11 @@ def render_vector(self, name, size, _type, init=None, elements_per_row=10):
         retval += self.vector_end()
         return retval
 
+    def check_declaration(self, name):
+        return f"void check_{name}();\n\n"
+
     def check(self, name):
-        return f"""static void check_{name}() {{
+        return f"""void check_{name}() {{
         printf("Checking the {name} vector:\\n");
 
         int n_err = 0;
@@ -126,15 +132,41 @@ def generate_header(self, name, body):
             file.write(filerender)
 
     def generate_vector_header(self, name, size, _type, init=None, golden=None):
-        bodyrender = ""
-        bodyrender += self.includes
-        bodyrender += self.render_vector(name, _type, size, init=init)
+        render = ""
+        render += self.includes
+        render += self.render_vector(name, "extern " + _type, size)
+
+        if golden is not None:
+            render += self.render_vector("golden_" + name, "extern " + _type, size)
+            render += self.check_declaration(name)
+
+        self.generate_header(name, render)
+
+    def generate_source(self, name, body):
+        filename = name + ".c"
+        filepath = os.path.join(self.srcdir, filename)
+
+        print(f"Generating source file -> {filepath}")
+
+        with open(filepath, "w") as file:
+            file.write(body)
+
+    def generate_vector_source(self, name, size, _type, init=None, golden=None):
+        render = ""
+        render += f'#include "{name}.h"\n\n'
+        render += self.render_vector(name, "PI_L1 " + _type, size, init=init)
 
         if golden is not None:
-            bodyrender += self.render_vector("golden_" + name, _type, size, init=golden)
-            bodyrender += self.check(name)
+            render += self.render_vector(
+                "golden_" + name, "PI_L1 " + _type, size, init=golden
+            )
+            render += self.check(name)
+
+        self.generate_source(name, render)
 
-        self.generate_header(name, bodyrender)
+    def generate_vector_files(self, name, size, _type, init=None, golden=None):
+        self.generate_vector_source(name, size, _type, init, golden)
+        self.generate_vector_header(name, size, _type, init, golden)
 
     def render_dims(self, name, dims):
         retval = ""
diff --git a/test/Ne16TestClasses.py b/test/Ne16TestClasses.py
index 79b5867..d99e829 100644
--- a/test/Ne16TestClasses.py
+++ b/test/Ne16TestClasses.py
@@ -16,7 +16,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import List, Union, Sequence, Optional, Set
+from __future__ import annotations
+from typing import List, Union, Optional, Set, Tuple
 import torch
 import numpy as np
 import torch.nn.functional as F
@@ -101,15 +102,15 @@ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType
             Ne16TestConf._check_type("bias_type", v, ["int32"])
         return v
 
-    @model_validator(mode="after")
-    def check_valid_out_channel_with_stride_2x2(self) -> "Ne16TestConf":
+    @model_validator(mode="after")  # type: ignore
+    def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf:
         assert implies(
             self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0
         ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}"
         return self
 
-    @model_validator(mode="after")
-    def check_valid_depthwise(self) -> "Ne16TestConf":
+    @model_validator(mode="after")  # type: ignore
+    def check_valid_depthwise(self) -> Ne16TestConf:
         assert implies(
             self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
         ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
@@ -119,8 +120,8 @@ def check_valid_depthwise(self) -> "Ne16TestConf":
         )
         return self
 
-    @model_validator(mode="after")
-    def check_valid_padding_with_kernel_shape_1x1(self) -> "Ne16TestConf":
+    @model_validator(mode="after")  # type: ignore
+    def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf:
         assert implies(
             self.kernel_shape == KernelShape(height=1, width=1),
             self.padding == Padding(top=0, bottom=0, left=0, right=0),
@@ -133,16 +134,16 @@ def check_valid_has_norm_quant(cls, v: bool) -> bool:
         assert v == True, f"Untested without has_norm_quant."
         return v
 
-    @model_validator(mode="after")
-    def check_valid_norm_quant_types_when_has_norm_qunat(self) -> "Ne16TestConf":
+    @model_validator(mode="after")  # type: ignore
+    def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf:
         if self.has_norm_quant:
             assert self.scale_type is not None, "Scale type was not provided."
             if self.has_bias:
                 assert self.bias_type is not None, "Bias type was not provided."
         return self
 
-    @model_validator(mode="after")
-    def check_valid_out_type_with_flags(self) -> "Ne16TestConf":
+    @model_validator(mode="after")  # type: ignore
+    def check_valid_out_type_with_flags(self) -> Ne16TestConf:
         assert implies(
             not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE
         ), (
@@ -271,7 +272,7 @@ def _global_shift(
         return global_shift
 
     @staticmethod
-    def _random_data(_type: IntegerType, shape: Sequence[int]):
+    def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]):
         return torch.randint(_type.min, _type.max, size=shape)
 
     @staticmethod
@@ -393,12 +394,11 @@ def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test:
 
 
 class Ne16TestHeaderGenerator:
-    DEFAULT_HEADERS_DIR = "app/gen_inc"
+    DEFAULT_HEADERS_DIR = "app/gen"
 
     def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None):
         if headers_dir is None:
             headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR
-        os.makedirs(headers_dir, exist_ok=True)
         self.header_writer = HeaderWriter(headers_dir)
 
     def generate(self, test_name: str, test: Ne16Test):
@@ -409,14 +409,14 @@ def generate(self, test_name: str, test: Ne16Test):
         # Render input
         in_ctype = test.conf.in_type.ctype()
         in_data = test.input.permute(0, 2, 3, 1).ravel()
-        self.header_writer.generate_vector_header(
+        self.header_writer.generate_vector_files(
             "input", _type=in_ctype, size=in_data.numel(), init=in_data
         )
 
         # Render output
         out_ctype = test.conf.out_type.ctype()
         out_data_golden = test.output.permute(0, 2, 3, 1).ravel()
-        self.header_writer.generate_vector_header(
+        self.header_writer.generate_vector_files(
             "output",
             _type=out_ctype,
             size=out_data_golden.numel(),
@@ -436,7 +436,7 @@ def generate(self, test_name: str, test: Ne16Test):
             weight_type._bits,
             depthwise=test.conf.depthwise,
         )
-        self.header_writer.generate_vector_header(
+        self.header_writer.generate_vector_files(
             "weight", _type="uint8_t", size=weight_init.size, init=weight_init
         )
 
@@ -444,7 +444,7 @@ def generate(self, test_name: str, test: Ne16Test):
         if test.scale is not None:
             assert test.conf.scale_type is not None
             scale_ctype = test.conf.scale_type.ctype()
-            self.header_writer.generate_vector_header(
+            self.header_writer.generate_vector_files(
                 "scale",
                 _type=scale_ctype,
                 size=test.scale.numel(),
@@ -455,7 +455,7 @@ def generate(self, test_name: str, test: Ne16Test):
         if test.bias is not None:
             assert test.conf.bias_type is not None
             bias_ctype = test.conf.bias_type.ctype()
-            self.header_writer.generate_vector_header(
+            self.header_writer.generate_vector_files(
                 "bias", _type=bias_ctype, size=test.bias.numel(), init=test.bias.ravel()
             )
 
diff --git a/test/TestClasses.py b/test/TestClasses.py
index c56eec2..c10641c 100644
--- a/test/TestClasses.py
+++ b/test/TestClasses.py
@@ -52,7 +52,7 @@ class Padding(BaseModel):
 class IntegerType(BaseModel):
     name: str
 
-    @model_validator(mode="before")
+    @model_validator(mode="before")  # type: ignore
     @classmethod
     def model_validate_before(cls, data: Any) -> Dict:
         if isinstance(data, str):
@@ -122,5 +122,5 @@ def model_dump(
             exclude_none: bool = False,
             round_trip: bool = False,
             warnings: bool = True,
-        ) -> str:
+        ) -> dict[str, Any]:
             ...
diff --git a/test/app/Makefile b/test/app/Makefile
index e5051b1..14f30fd 100644
--- a/test/app/Makefile
+++ b/test/app/Makefile
@@ -16,43 +16,49 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-APP := main
+# Set runner_args="--trace=ne16" if you want to trace what is happening in the ne16
+
+ACCELERATOR ?= ne16
 
+APP := main
 LIBDIR := $(abspath ../..)
+ACC_DIR := $(LIBDIR)/$(ACCELERATOR)
 
 
 # Include directories
 
 ## Test
-INC_DIRS += inc gen_inc
+INC_DIRS += inc
 
-## PULP-NNX
-INC_DIRS += $(LIBDIR)/inc
+## Library
+INC_DIRS += $(LIBDIR)/inc $(LIBDIR)/util
 
-## NE16
-INC_DIRS += $(LIBDIR)/ne16/hal $(LIBDIR)/ne16/gvsoc
+## Accelerator
+INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp
 
-## Util
-INC_DIRS += $(LIBDIR)/util
+## Generated 
+INC_DIRS += gen/inc
 
 INC_FLAGS += $(addprefix -I,$(INC_DIRS))
 
-
 # Source files
 
 ## Test
 APP_SRCS += $(wildcard src/*.c)
 
-## PULP-NNX
-APP_SRCS += $(LIBDIR)/src/pulp_nnx_ne16.c
+## Library
+APP_SRCS += $(LIBDIR)/src/pulp_nnx_$(ACCELERATOR).c $(wildcard $(LIBDIR)/util/*.c)
+
+## Accelerator
+APP_SRCS += $(wildcard $(ACC_DIR)/hal/*.c) $(wildcard $(ACC_DIR)/gvsoc/*.c) $(wildcard $(ACC_DIR)/bsp/*.c)
 
-## NE16
-APP_SRCS += $(wildcard $(LIBDIR)/ne16/hal/*.c) $(wildcard $(LIBDIR)/ne16/gvsoc/*.c)
+## Generated 
+APP_SRCS += $(wildcard gen/src/*.c)
 
-## Util
-APP_SRCS += $(LIBDIR)/util/pulp_nnx_util.c
 
+# Flags (NOTE: -w inhibits every warning, so -Wall -Werror are inert while it is set)
 
-APP_CFLAGS += $(INC_FLAGS) -O2 -w
+APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto
+APP_LDFLAGS += -flto
 
 include $(RULES_DIR)/pmsis_rules.mk
diff --git a/test/app/src/main.c b/test/app/src/main.c
index 1f191b3..cc67050 100644
--- a/test/app/src/main.c
+++ b/test/app/src/main.c
@@ -22,8 +22,9 @@
 
 #include "layer_util.h"
 #include "nnx_layer.h"
+#include "output.h"
 
-void app_kickoff(void *args) {
+int main() {
   struct pi_device cl_dev;
   struct pi_cluster_conf cl_conf;
   struct pi_cluster_task cl_task;
@@ -47,7 +48,8 @@ void app_kickoff(void *args) {
   printf("\n");
   printf("Test %s finished\n", TEST_NAME);
 
-  pmsis_exit(0);
-}
+  printf("\n");
+  check_output();
 
-int main() { return pmsis_kickoff((void *)app_kickoff); }
+  return 0;
+}
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c
index fe2924f..ffd93a1 100644
--- a/test/app/src/nnx_layer.c
+++ b/test/app/src/nnx_layer.c
@@ -18,13 +18,13 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <pmsis.h>
-
 #include "nnx_layer.h"
-#include "pulp_nnx.h"
-
-#include "ne16_gvsoc_logging.h"
-#include "ne16_hal.h"
+#include "ne16.h"
+#include "ne16_gvsoc.h"
+#include "ne16_pulp_bsp.h"
+#include "ne16_task.h"
+#include "pulp_nnx_ne16.h"
+#include <pmsis.h>
 
 // Generated headers
 #include "bias.h"
@@ -34,65 +34,73 @@
 #include "scale.h"
 #include "weight.h"
 
-void execute_nnx_layer(void *unused_args) {
-  ne16_activate_gvsoc_logging(NE16_GVSOC_LOG_LEVEL_ALL,
-                              NE16_GVSOC_LOGGING_FORMAT_HEXADECIMAL);
-  const int nnx_max_stall = 8;
-  nnx_init(nnx_max_stall);
+static void task_prepare(ne16_task_t *task) { // fills *task: init, dims, pointers
+  ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS,
+                 WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET,
+                 (ne16_quant_t){.shift_amount = OUTSHIFT,
+                                .mode = quantMode8Bit,
+                                .function = HAS_RELU ? quantFunctionRelu
+                                                     : quantFunctionIdentity,
+                                .flag_rounding = ne16TaskFlagFalse},
+                 (ne16_norm_t){.mode = normMode8Bit,
+                               .flag_bias = HAS_BIAS ? ne16TaskFlagTrue
+                                                     : ne16TaskFlagFalse,
+                               .flag_shift = ne16TaskFlagFalse},
+                 STRIDE_HEIGHT);
 
-  nnx_task_t task;
-  nnx_task_init(
-      &task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS,
-      weightOffsetModeLayerWise, WEIGHT_OFFSET,
-      (nnx_quant_t){.shift_amount = OUTSHIFT,
-                    .mode = quantMode8Bit,
-                    .function =
-                        HAS_RELU ? quantFunctionRelu : quantFunctionIdentity,
-                    .flag_rounding = NE16_FLAG_UNUSED},
-      (nnx_norm_t){.mode = normMode8Bit,
-                   .flag_bias = HAS_BIAS ? NE16_FLAG_USED : NE16_FLAG_UNUSED,
-                   .flag_shift = NE16_FLAG_UNUSED},
-      STRIDE_HEIGHT);
-
-  if (STRIDE_HEIGHT == 1) {
-    nnx_task_set_dims(&task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
-                      INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
-                      OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP,
-                      PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
-  } else {
-    nnx_task_set_dims_stride2x2(
-        &task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
+  if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { // 2x2 stride has its own setter
+    ne16_task_set_dims_stride2x2(
+        task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
         INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL,
         OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP,
         PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
+  } else { // every other stride combination takes the generic setter
+    ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
+                       INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
+                       OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP,
+                       PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
   }
 
-  nnx_task_set_ptrs(&task, input, INPUT_WIDTH, INPUT_CHANNEL, INPUT_BITS,
-                    PADDING_TOP, PADDING_LEFT, output, weight, scale, NULL,
+  ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL,
+                     INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output,
+                     (uint32_t)weight, (uint32_t)scale, NULL, // NOTE(review): bare NULL among uint32_t casts - confirm param type
 #if HAS_BIAS == 1
-                    bias
+                     (uint32_t)bias
 #else
-                    NULL
+                     NULL
 #endif
   );
+}
 
-  nnx_dispatch_check_blocking();
+static void task_execute(ne16_task_t *task) {
+  ne16_dev_t *dev = ne16_pulp_get_dev(); // device handle passed to every call below
 
-  if (STRIDE_HEIGHT == 1) {
-    nnx_dispatch_task(&task);
-  } else {
-    nnx_dispatch_task_stride2x2(&task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
+  ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG,
+                          NE16_GVSOC_LOG_FORMAT_HEXADECIMAL);
+
+  ne16_pulp_conf_t conf = {.max_stall = 8};
+  ne16_nnx_init(dev, &conf);
+
+  ne16_nnx_dispatch_wait(dev); // presumably waits until a task can be dispatched - confirm
+
+  if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { // same stride test as in task_prepare
+    ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
                                 INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
                                 OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL,
                                 WEIGHT_HEIGHT, WEIGHT_WIDTH);
+  } else {
+    ne16_nnx_dispatch(dev, task);
   }
-  // nnx_resolve_check_blocking(&task);
-  while (!ne16_empty())
-    ne16_event_wait();
 
-  nnx_term();
-  ne16_deactivate_gvsoc_logging();
+  ne16_nnx_resolve_wait(dev, task); // presumably blocks until the task completes - confirm
+
+  ne16_nnx_term(dev);
+
+  ne16_gvsoc_log_deactivate(dev); // pairs with ne16_gvsoc_log_activate above
+}
 
-  printf("\n");
-  check_output();
+void execute_nnx_layer(void *args) { // args is never read in this body
+  ne16_task_t task;    // stack-local task, passed by address to both helpers
+  task_prepare(&task); // init, dimensions and data pointers
+  task_execute(&task); // device init, dispatch, resolve wait, termination
 }
diff --git a/test/conftest.py b/test/conftest.py
index 7ed485c..6c2c15b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -49,7 +49,7 @@ def pytest_addoption(parser):
         "--timeout",
         type=int,
         default=120,
-        help="Execution timeout in seconds. Default: 120s"
+        help="Execution timeout in seconds. Default: 120s",
     )
 
 
diff --git a/test/test.py b/test/test.py
index 461db19..39709b6 100644
--- a/test/test.py
+++ b/test/test.py
@@ -22,6 +22,7 @@
 import locale
 import subprocess
 from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator
+from pathlib import Path
 
 HORIZONTAL_LINE = "\n" + "-" * 100 + "\n"
 
@@ -99,6 +100,7 @@ def test(path: str, timeout: int):
 
     Ne16TestHeaderGenerator().generate(test_name, test)
 
+    Path("app/src/nnx_layer.c").touch()
     cmd = f"make -C app all run platform=gvsoc"
     passed, msg, stdout, stderr = execute_command(cmd=cmd, timeout=timeout)
 
diff --git a/util/hwpe.c b/util/hwpe.c
new file mode 100644
index 0000000..53c1ace
--- /dev/null
+++ b/util/hwpe.c
@@ -0,0 +1,85 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "hwpe.h"
+#include <stdint.h>
+
+#define HWPE_TRIGGER 0 // written by hwpe_task_queue_release{,_and_run}
+#define HWPE_ACQUIRE 1 // read by hwpe_task_queue_acquire_task
+#define HWPE_FINISHED 2 // not referenced in this file
+#define HWPE_STATUS 3 // read by hwpe_task_queue_status
+#define HWPE_RUNNING_JOB 4 // read by hwpe_last_task_id
+#define HWPE_SOFT_CLEAR 5 // written by hwpe_soft_clear
+#define HWPE_SWSYNC 6 // not referenced in this file
+#define HWPE_TASK_REG_OFFSET 8 // added to reg by hwpe_task_reg_read/write
+
+inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
+  dev->base_addr[reg] = value; // reg counts 32-bit registers from base_addr
+}
+
+inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) {
+  return dev->base_addr[reg]; // reg counts 32-bit registers from base_addr
+}
+
+inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
+  hwpe_reg_write(dev, reg + HWPE_TASK_REG_OFFSET, value); // task regs start at 8
+}
+
+inline uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg) {
+  return hwpe_reg_read(dev, reg + HWPE_TASK_REG_OFFSET); // task regs start at 8
+}
+
+void hwpe_soft_clear(hwpe_dev_t *dev) {
+  hwpe_reg_write(dev, HWPE_SOFT_CLEAR, 0);
+  for (volatile int spin = 0; spin < 10; spin++)
+    ; // empty body: short busy-wait after the clear write (counter is volatile)
+}
+
+uint32_t hwpe_task_queue_status(hwpe_dev_t *dev) {
+  return hwpe_reg_read(dev, HWPE_STATUS); // raw register value, not decoded here
+}
+
+int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id) {
+  const uint32_t acquired = hwpe_reg_read(dev, HWPE_ACQUIRE);
+  // A value that does not fit in 8 bits means failure: return 1, *id untouched.
+  if (acquired > UINT8_MAX) {
+    return 1;
+  }
+  *id = (uint8_t)acquired; // success: store the id and return 0
+  return 0;
+}
+
+void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len) {
+  for (int reg = 0; reg < len; ++reg) { // data[reg] goes to task register reg
+    hwpe_task_reg_write(dev, reg, data[reg]);
+  }
+}
+
+void hwpe_task_queue_release_and_run(hwpe_dev_t *dev) {
+  hwpe_reg_write(dev, HWPE_TRIGGER, 0); // TRIGGER <- 0 (plain release writes 1)
+}
+
+void hwpe_task_queue_release(hwpe_dev_t *dev) {
+  hwpe_reg_write(dev, HWPE_TRIGGER, 1); // TRIGGER <- 1 (release_and_run writes 0)
+}
+
+uint8_t hwpe_last_task_id(hwpe_dev_t *dev) {
+  return (uint8_t)hwpe_reg_read(dev, HWPE_RUNNING_JOB); // keeps the low 8 bits
+}
diff --git a/util/hwpe.h b/util/hwpe.h
new file mode 100644
index 0000000..52bf912
--- /dev/null
+++ b/util/hwpe.h
@@ -0,0 +1,43 @@
+/*
+ * Luka Macan <luka.macan@unibo.it>
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __HWPE_H__
+#define __HWPE_H__
+
+#include <stdint.h>
+
+/* HWPE device */
+typedef struct hwpe_dev_t {
+  volatile uint32_t *base_addr; /* register file; indexed in 32-bit words */
+} hwpe_dev_t;
+
+void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value); /* reg: word index */
+uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg); /* reg: word index */
+void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value);
+uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg);
+void hwpe_soft_clear(hwpe_dev_t *dev);
+uint32_t hwpe_task_queue_status(hwpe_dev_t *dev);
+int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id); /* 0 and sets *id, or 1 */
+void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len);
+void hwpe_task_queue_release_and_run(hwpe_dev_t *dev); /* writes 0 to the trigger reg */
+void hwpe_task_queue_release(hwpe_dev_t *dev); /* writes 1 to the trigger reg */
+uint8_t hwpe_last_task_id(hwpe_dev_t *dev);
+
+#endif // !__HWPE_H__