diff --git a/Makefile b/Makefile
index 47522c107..bcd4d7138 100644
--- a/Makefile
+++ b/Makefile
@@ -218,6 +218,7 @@ toolchain/riscv-opcodes/*:
 
 format:
 	$(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR)
+	find $(ROOT_DIR)/software/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
 
 clean: clean-riscv-tests
 	rm -rf $(INSTALL_DIR)
diff --git a/python-requirements.txt b/python-requirements.txt
index 09e19ccd7..d0e903cda 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -14,3 +14,4 @@ pandas
 progressbar2
 tabulate
 sympy
+scipy
diff --git a/software/.gitignore b/software/.gitignore
index 49abad0af..35dccde4a 100644
--- a/software/.gitignore
+++ b/software/.gitignore
@@ -27,3 +27,4 @@ runtime/arch.ld
 # Generated data files
 data.h
 data/data*.h
+data/__pyc*
diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index cc9e2db7a..c4a2a40a3 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -17,8 +17,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime)
 include $(RUNTIME_DIR)/runtime.mk
 
 APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c"))
-DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args"))
-ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
@@ -34,7 +32,7 @@ all_llvm: $(ALL_LLVM)
 $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S,c,h,ld} -type f)
 
 .PHONY: $(BINARIES)
-$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes
+$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes
 	mkdir -p $(dir $@)
 	$(RISCV_CC) -Iinclude -o $@ $< $(RUNTIME) $(RISCV_LDFLAGS) -T$(RUNTIME_DIR)/link.ld
 	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump
@@ -50,5 +48,6 @@ clean:
 	rm -vf $(addsuffix /main.c.o,$(APPS))
 	rm -vf $(RUNTIME)
 	rm -vf $(LINKER_SCRIPT)
+	rm -vf $(wildcard $(DATA_DIR)/data_*.h)
 
 .INTERMEDIATE: $(addsuffix /main.c.o,$(APPS))
diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c
index a9354796e..c391ba040 100644
--- a/software/apps/baremetal/axpy_i32/main.c
+++ b/software/apps/baremetal/axpy_i32/main.c
@@ -5,125 +5,50 @@
 // Author: Yichao Zhang, ETH Zurich
 
 #include <stdint.h>
+#include <stdlib.h>
 #include <string.h>
 
-#include "baremetal/mempool_axpy_i32p.h"
+/* Mempool runtime libraries */
+#include "builtins_v2.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include <stdlib.h>
-
-#if NUM_CORES > 32
-#define size_M 64
-#define size_N 64
-#else
-#define size_M (NUM_CORES)
-#define size_N (NUM_CORES)
-#endif
-
-#define ALPHA 2
 
-#if NUM_CORES > 32
-int32_t data_x[size_M * size_N]
-    __attribute__((aligned(64 * 1024), section(".l1")));
-int32_t data_y[size_M * size_N]
-    __attribute__((aligned(64 * 1024), section(".l1")));
-int32_t data_y_copy[size_M * size_N]
-    __attribute__((aligned(64 * 1024), section(".l1")));
-#else
-int32_t data_x[size_M * size_N] __attribute__((aligned(32), section(".l1")));
-int32_t data_y[size_M * size_N] __attribute__((aligned(32), section(".l1")));
-int32_t data_y_copy[size_M * size_N]
-    __attribute__((aligned(32), section(".l1")));
-#endif
+#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_checks.h"
+#include "data_axpy_i32.h"
 
+int32_t l1_X[array_N]
+    __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1")));
+int32_t l1_Y[array_N]
+    __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1")));
 int volatile error __attribute__((section(".l1")));
 
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  // How many rows/columns to split the matrix into
-  uint32_t const split = 8;
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
-
-int verify_axpy(int32_t *matrix_X, int32_t *matrix_Y, int32_t *matrix_Y_COPY,
-                int32_t alpha, uint32_t elements) {
-  for (uint32_t i = 0; i < elements; i++) {
-    if (matrix_Y[i] != matrix_X[i] * alpha + matrix_Y_COPY[i]) {
-      return 1;
-    }
-  }
-  return 0;
-}
-
 int main() {
 
   uint32_t const core_id = mempool_get_core_id();
   uint32_t const num_cores = mempool_get_core_count();
-  uint32_t const total_elements = size_M * size_N;
-
-  // Seed for create element matrix
-  int32_t const A_a = 1;
-  int32_t const A_b = 1;
-  int32_t const A_c = -32;
-  int32_t const B_a = 2;
-  int32_t const B_b = 1;
-  int32_t const B_c = 16;
-
-  // Initialize synchronization variables
   mempool_barrier_init(core_id);
+
+  // Initialize data
   if (core_id == 0) {
-    printf("Initialize %3d cores\n", num_cores);
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
     error = 0;
   }
-
-  // init_elements;
-  init_matrix(data_x, size_M, size_N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(data_y, size_M, size_N, B_a, B_b, B_c, core_id, num_cores);
-  init_matrix(data_y_copy, size_M, size_N, B_a, B_b, B_c, core_id, num_cores);
   mempool_barrier(num_cores);
 
-  // start kernel testing
+  // Benchmark
   mempool_start_benchmark();
-  calc_axpy_unloop_x4_localbank(data_x, data_y, ALPHA, total_elements, core_id,
-                                num_cores);
+  calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores);
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
-  // end kernel testing
 
   // Verify results
-  if (core_id == 0) {
-    printf("START CHECKING RESULTS\n");
-    if (verify_axpy(data_x, data_y, data_y_copy, ALPHA, total_elements)) {
-      printf("RESULTS ERROR\n");
-      error = 1;
-    } else {
-      printf("RESULTS CORRECT\n");
-    }
-  }
+  mempool_check_i32(l1_Y, l2_Z, array_N, 0, 0);
   mempool_barrier(num_cores);
 
-  return error;
+  return 0;
 }
diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c
index 105cf6370..e23fb929e 100644
--- a/software/apps/baremetal/cfft_radix2_q16/main.c
+++ b/software/apps/baremetal/cfft_radix2_q16/main.c
@@ -19,6 +19,7 @@
 #include "synchronization.h"
 
 #include "data_cfft_radix2_q16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /* CFFT mempool libraries */
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
@@ -69,7 +70,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_q16(l1_pSrc, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0);
+  mempool_check_i16(l1_pSrc, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c
index 88d7182fa..08ed80e9b 100644
--- a/software/apps/baremetal/cfft_radix4_q16/main.c
+++ b/software/apps/baremetal/cfft_radix4_q16/main.c
@@ -19,6 +19,8 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_q16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
 
 /* CHOOSE ONE */
 //#define SINGLE // Single core FFT.
@@ -225,7 +227,7 @@ int main() {
     printf("02: END COMPUTATION\n");
   }
 
-  mempool_check_q16(pRes, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0);
+  mempool_check_i16(pRes, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
index eecac204a..498c60260 100644
--- a/software/apps/baremetal/chest_q16/main.c
+++ b/software/apps/baremetal/chest_q16/main.c
@@ -62,7 +62,7 @@ int main() {
 #endif
 
   /* Check */
-  mempool_check_q16(l1_HEST, l2_HEST, 2 * N_TX * N_RX, 0, 0);
+  mempool_check_i16(l1_HEST, l2_HEST, 2 * N_TX * N_RX, 0, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cholesky_q32/initialization.h b/software/apps/baremetal/cholesky_q32/initialization.h
deleted file mode 100644
index 79993afa8..000000000
--- a/software/apps/baremetal/cholesky_q32/initialization.h
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#define FIXED_POINT 10
-#define HALF 1023
-#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
-#define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT))
-#define ABS(a) (a > 0 ? a : -a)
-
-void transpose(int32_t *matrix, int32_t *t_matrix, int32_t n);
-void matrixmult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
-                int32_t n);
-void display(int32_t *matrix, uint32_t num_rows, uint32_t num_columns);
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id);
-void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                       uint32_t core_id);
-
-void transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) {
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      t_matrix[j * n + i] = matrix[i * n + j];
-    }
-  }
-}
-
-void matrixmult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
-                int32_t n) {
-  int k;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) { // not j < M
-      matrix_product[i * n + j] = 0;
-      for (k = 0; k < n; k++) {
-        matrix_product[i * n + j] +=
-            FIX_MUL(matrix_1[i * n + k], matrix_2[k * n + j]);
-      }
-    }
-  }
-}
-
-void display(int32_t *matrix, uint32_t num_rows, uint32_t num_columns) {
-#if defined(FOLDED)
-  uint32_t i, j;
-  for (i = 0; i < num_rows; i++) {
-    for (j = 0; j < num_columns; j++) {
-      printf("%8d", matrix[i * N_BANKS + j]);
-    }
-    printf("\n");
-  }
-#else
-  uint32_t i, j;
-  for (i = 0; i < num_rows; i++) {
-    for (j = 0; j < num_columns; j++) {
-      printf("%8d ", matrix[i * num_columns + j]);
-    }
-    printf("\n");
-  }
-#endif
-}
-
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id) {
-  if (core_id == 0) {
-    for (uint32_t j = 0; j < num_rows; j++) {
-      for (uint32_t i = 0; i < num_columns; i++) {
-        matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
-
-void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                       uint32_t core_id) {
-  if (core_id == 0) {
-    for (uint32_t i = 0; i < num_columns; i++) {
-      for (uint32_t j = 0; j < num_rows; j++) {
-        matrix[j * num_columns + i] = 0;
-      }
-    }
-  }
-}
diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c
index a670da805..64fbf3b2f 100644
--- a/software/apps/baremetal/cholesky_q32/main.c
+++ b/software/apps/baremetal/cholesky_q32/main.c
@@ -4,180 +4,126 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-#define N_BANKS (NUM_CORES * 4)
-/* Matrix dimension */
-#define N 4
-
+#define HALF (1023)
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define FIX_DIV(a, b) ((int32_t)(((a) << FIXED_POINT) / (b)))
+#define FIX_MUL(a, b) ((int32_t)(((a) * (b) + HALF) >> FIXED_POINT))
+#define ABS(a) ((a) > 0 ? (a) : -(a))
 #define SINGLE
 //#define PARALLEL
 //#define SCHEDULING
-//#define LINSOLVE4
+//#define LINSOLVER
 
-#define N_COL 1
-#define N_ROW 1
-#define N_ITR 1
+#include "data_cholesky_q32.h"
+
+#include "baremetal/mempool_cholesky_q32p.h"
+#include "baremetal/mempool_cholesky_q32s.h"
+#include "baremetal/mempool_linearsolver_q32p.h"
+#include "baremetal/mempool_linearsolver_q32s.h"
 
-int32_t A_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t AT_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t M_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1")));
+#define N_COL 1
+#define N_ROW 1
 #ifndef SCHEDULING
-int32_t L_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t In[N] __attribute__((aligned(N_BANKS), section(".l1")));
-#else
-// Matrices to generate the hermitian
-int32_t In_matrix[N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1")));
-// Outputs and input vector for linear system solution
-int32_t LL_matrix[N_ROW * N * N_BANKS]
+int32_t l1_A[matrix_N * matrix_N]
     __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t LR_matrix[N_ROW * N * N_BANKS]
+int32_t l1_L[matrix_N * matrix_N]
     __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t In[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1")));
+#else
+int32_t l1_AA[matrix_N * N_BANKS]
+    __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_LL[N_ROW * matrix_N * N_BANKS]
+    __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_LR[N_ROW * matrix_N * N_BANKS]
+    __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio")));
 #endif
 
-#include "initialization.h"
-
-#include "baremetal/mempool_cholesky_q32s.h"
-#include "baremetal/mempool_linearsolver_q32s.h"
-
-#include "baremetal/mempool_cholesky_q32p.h"
-#include "baremetal/mempool_linearsolver_q32p.h"
-
-void initialize() {
+int main() {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
 
-  /* Initialize input matrices */
-  init_matrix(A_matrix, N, N, -156, 427, -219, core_id);
-  init_matrix_zeros(AT_matrix, N, N, core_id);
-  init_matrix_zeros(M_matrix, N, N, core_id);
-
-#ifndef SCHEDULING
-
-  init_matrix_zeros(L_matrix, N, N, core_id);
-  mempool_barrier(num_cores);
-  /* Create positive definite matrix */
-  if (core_id == 0) {
-    transpose(A_matrix, AT_matrix, N);
-    matrixmult(AT_matrix, A_matrix, M_matrix, N);
-    printf("Done initialization.\n");
-  }
-  mempool_barrier(num_cores);
-#ifdef LINEARSOLVER
-  init_matrix(In, 1, N, -156, 427, -219, core_id);
-  mempool_barrier(num_cores);
-#endif
-
-#else
-
-  init_matrix_zeros(In_matrix, N, N_BANKS, core_id);
-  init_matrix_zeros(LL_matrix, N_ROW * N, N_BANKS, core_id);
-  init_matrix_zeros(LR_matrix, N_ROW * N, N_BANKS, core_id);
-  mempool_barrier(num_cores);
-  /* Create positive definite matrix */
+// Initialize
+#if defined(SCHEDULING)
   if (core_id == 0) {
-    transpose(A_matrix, AT_matrix, N);
-    matrixmult(AT_matrix, A_matrix, M_matrix, N);
-    for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
-      for (uint32_t i = 0; i < N; i++) {
-        for (uint32_t j = 0; j < N; j++) {
-          In_matrix[idx_col * N + i * N_BANKS + j] = M_matrix[i * N + j];
+    for (uint32_t i = 0; i < matrix_N; i++) {
+      for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
+        l1_yy[idx_col * matrix_N + i] = l2_y[i];
+        for (uint32_t j = 0; j < matrix_N; j++) {
+          l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j];
         }
       }
     }
-    printf("Done initialization.\n");
+    for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) {
+      l1_LL[i] = 0;
+      l1_LR[i] = 0;
+    }
+  }
+#else
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, matrix_N * matrix_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_L, l2_L, matrix_N * matrix_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_y, l2_y, matrix_N * sizeof(int32_t));
   }
-  mempool_barrier(num_cores);
-#ifdef LINEARSOLVER
-  init_matrix(In, 1, N_BANKS, -156, 427, -219, core_id);
-  mempool_barrier(num_cores);
-#endif
-
 #endif
-  return;
-}
-
-/* BENCHMARK */
-
-int main() {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  mempool_barrier_init(core_id);
-
-  initialize();
+  mempool_barrier(num_cores);
 
+  // Benchmark
 #if defined(SINGLE)
   if (core_id == 0) {
     mempool_start_benchmark();
-    for (uint32_t i = 0; i < N_ITR; i++) {
-#ifndef LINEARSOLVER
-      // TEST #1 SINGLE-CORE CHOLESKY DECOMPOSITION
-      mempool_cholesky_crout_q32s(M_matrix, L_matrix, N);
-#else
-      // TEST #2 SINGLE-CORE LINEAR-SYSTEM SOLUTION
-      mempool_linearsolver_q32s(M_matrix, L_matrix, In, N);
-      mempool_uprtrisolver_q32s(L_matrix, In, N);
-#endif
-    }
+    // TEST #1 SINGLE-CORE CHOLESKY DECOMPOSITION
+    mempool_cholesky_crout_q32s(l1_A, l1_L, matrix_N);
+    // // TEST #2 SINGLE-CORE LINEAR-SYSTEM SOLUTION
+    // mempool_linearsolver_q32s(l1_A, l1_L, l1_y, matrix_N);
+    // mempool_uprtrisolver_q32s(l1_L, l1_y, matrix_N);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
 #endif
 
-#if defined(PARALLEL) && !defined(SCHEDULING)
-#ifndef LINEARSOLVER
+#if defined(PARALLEL)
   // TEST #3 PARALLEL CHOLESKY DECOMPOSITION
+  // No trivial parallelization of linearsolver kernels
   mempool_start_benchmark();
-  mempool_cholesky_q32p(M_matrix, L_matrix, N);
+  mempool_cholesky_q32p(l1_A, l1_L, matrix_N);
   mempool_stop_benchmark();
   mempool_barrier(num_cores);
-#else
-// No trivial parallelization of linearsolver kernels
-#endif
-  mempool_barrier(num_cores);
 #endif
 
-#if defined(PARALLEL) && defined(SCHEDULING)
-  uint32_t nPE = (N / 4);
-  if (nPE > 1) {
-    /* Each decomposition is finely-grained parallelized over multiple cores */
-    if (core_id < N_COL * nPE) {
-      mempool_start_benchmark();
-#ifndef LINEARSOLVER
-      // TEST #4 FINE-GRAINED PARALLEL CHOLESKY DECOMPOSITION x N_ROW x N_COL
-      mempool_cholesky_fold_schedule_q32p(In_matrix, In_matrix, LL_matrix,
-                                          LR_matrix, N, N_ROW, N_COL);
-#else
-      // TEST #5 FINE-GRAINED PARALLEL LINEAR-SYSTEM SOLUTION x N_ROW x N_COL
-      mempool_linearsolver_fold_q32p(In_matrix, In_matrix, LL_matrix, LR_matrix,
-                                     In, N, N_ROW, N_COL);
-#endif
-      mempool_stop_benchmark();
-    }
-  }
-  if (nPE == 1) {
-    /* The decomposition is executed with a single-core. Each core gets a
-     * different input problem. This is the specific case of the 4x4 matrix. */
-    if (core_id < N_COL * nPE) {
-      mempool_start_benchmark();
-#ifndef LINEARSOLVER
-      // TEST #6 SINGLE-CORE CHOLESKY DECOMPOSITION x N_ROW x N_COL
-      mempool_cholesky_schedule_q32s(In_matrix, LL_matrix, N, N_ROW, N_COL);
-#else
-      // TEST #7 SINGLE-CORE LINEAR-SYSTEM SOLUTION x N_ROW x N_COL
-      mempool_linearsolver_q32s(In_matrix, LL_matrix, In, N, N_ROW, N_COL);
-#endif
-      mempool_stop_benchmark();
-    }
+#if defined(SCHEDULING)
+  /* Case nPE > 1: each decomposition is parallelized over multiple cores. */
+  uint32_t nPE = (matrix_N / 4);
+  if ((nPE > 1) && (core_id < N_COL * nPE)) {
+    mempool_start_benchmark();
+    // TEST #4 FINE-GRAINED PARALLEL CHOLESKY DECOMPOSITION x N_ROW x N_COL
+    mempool_cholesky_fold_schedule_q32p(l1_AA, l1_AA, l1_LL, l1_LR, matrix_N,
+                                        N_ROW, N_COL);
+    // // TEST #5 FINE-GRAINED PARALLEL LINEAR-SYSTEM SOLUTION x N_ROW x N_COL
+    // mempool_linearsolver_fold_q32p(l1_AA, l1_AA, l1_LL, l1_LR, l1_yy,
+    // matrix_N, N_ROW, N_COL);
+    mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
+
+  /* Case nPE == 1: the decomposition runs on a single core. Each core gets a
+   * different input problem. This is the specific case of the 4x4 matrix. */
+  if ((nPE == 1) && (core_id < N_COL * nPE)) {
+    mempool_start_benchmark();
+    // TEST #6 SINGLE-CORE CHOLESKY DECOMPOSITION x N_ROW x N_COL
+    mempool_cholesky_schedule_q32s(l1_AA, l1_LL, matrix_N, N_ROW, N_COL);
+    // // TEST #7 SINGLE-CORE LINEAR-SYSTEM SOLUTION x N_ROW x N_COL
+    // mempool_linearsolver_q32s(l1_AA, l1_LL, l1_yy, matrix_N, N_ROW, N_COL);
+    mempool_stop_benchmark();
+  }
 #endif
 
   return 0;
diff --git a/software/apps/baremetal/dotp_i32/define.h b/software/apps/baremetal/dotp_i32/define.h
deleted file mode 100644
index d2b069d21..000000000
--- a/software/apps/baremetal/dotp_i32/define.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#define LEN (1024)
-#define N_PE (NUM_CORES)
-#define N_BANK (NUM_CORES * 4)
-#define N_BANK_PE (N_PE * 4)
-
-/* Enable log barriers */
-#define LOG_BARRIERS
-
-/* STEP core 0 reduction */
-#define STEP (256)
-#define STEP_CORES (STEP / 4)
-
-//////////////////////////////////
-/*          SELECT ONE          */
-
-// #define SINGLE
-// #define SINGLE_UNROLLED
-
-// #define PARALLEL
-// #define PARALLEL_UNROLLED
-
-// #define PARALLEL_LOCAL
-// #define LOCAL_UNROLLED
-
-// #define PARALLEL_RED0
-// #define PARALLEL_UNROLLED_RED0
-
-// #define PARALLEL_REDTREE
-// #define PARALLEL_UNROLLED_REDTREE
-
-//////////////////////////////////
-
-// Vectors for kernel computation
-int32_t vector_a[LEN] __attribute__((aligned(LEN), section(".l1")));
-int32_t vector_b[LEN] __attribute__((aligned(LEN), section(".l1")));
-
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) ||               \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-int32_t sum[N_BANK] __attribute__((aligned(N_BANK), section(".l1")));
-#else
-int32_t sum __attribute__((section(".l1")));
-#endif
-
-// Vectors for performance metrics
-uint32_t volatile red_barrier[NUM_CORES * 4]
-    __attribute__((aligned(NUM_CORES * 4), section(".l1")));
-int32_t result __attribute__((section(".l1")));
-int32_t check __attribute__((section(".l1")));
-int volatile error __attribute__((section(".l1")));
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel.h b/software/apps/baremetal/dotp_i32/dotp_parallel.h
deleted file mode 100644
index b765f6987..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/* Parallel dot-product */
-void dotp_parallel(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
-                   uint32_t nPE) {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t step = Len / nPE;
-
-  register int32_t local_sum = 0;
-  register int32_t a, b;
-  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
-    a = in_a[i];
-    b = in_b[i];
-    local_sum += a * b;
-  }
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-  mempool_log_barrier(2, core_id);
-  (void)num_cores;
-#else
-  mempool_barrier(num_cores);
-#endif
-}
-
-/* Parallel dot-product */
-void dotp_parallel_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                             uint32_t Len, uint32_t nPE) {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t step = Len / nPE;
-  uint32_t reminder = step % 4;
-  uint32_t i;
-
-  register int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0,
-                   b3 = 0;
-  register int32_t local_sum0 = 0;
-  register int32_t local_sum1 = 0;
-  register int32_t local_sum2 = 0;
-  register int32_t local_sum3 = 0;
-  for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
-    a0 = in_a[i];
-    b0 = in_b[i];
-    a1 = in_a[i + 1];
-    b1 = in_b[i + 1];
-    a2 = in_a[i + 2];
-    b2 = in_b[i + 2];
-    a3 = in_a[i + 3];
-    b3 = in_b[i + 3];
-    local_sum0 += a0 * b0;
-    local_sum1 += a1 * b1;
-    local_sum2 += a2 * b2;
-    local_sum3 += a3 * b3;
-  }
-  i = core_id * step + step - reminder;
-  while (i < step) {
-    a0 = in_a[i];
-    b0 = in_b[i];
-    local_sum0 += a0 * b0;
-    i++;
-  }
-  local_sum0 += local_sum1;
-  local_sum2 += local_sum3;
-  local_sum0 += local_sum2;
-  mempool_barrier(num_cores);
-
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-  mempool_log_barrier(2, core_id);
-#else
-  mempool_barrier(num_cores);
-#endif
-}
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h b/software/apps/baremetal/dotp_i32/dotp_parallel_local.h
deleted file mode 100644
index 950955832..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/*
-  Parallel dot-product with final reduction performed by multiple cores
-  using atomic-fetch and adds to a single memory location.
-   A) Parallelized workload
-   B) Atomic fetch and add to a single memory location
-   C) Barrier */
-
-/*******************************************************/
-/**                    MULTI-CORE                     **/
-/*******************************************************/
-
-/* Parallel dot-product */
-void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
-                         uint32_t nPE) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-
-  if (nPE == num_cores) {
-    register int32_t local_sum = 0;
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      local_sum += in_a[idx] * in_b[idx];
-      local_sum += in_a[idx + 1] * in_b[idx + 1];
-      local_sum += in_a[idx + 2] * in_b[idx + 2];
-      local_sum += in_a[idx + 3] * in_b[idx + 3];
-      idx += N_BANK;
-    }
-    if (core_id == (Len % N_BANK) / 4) {
-      while (idx < Len) {
-        local_sum += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
-    __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_barrier(2, core_id);
-#else
-    mempool_barrier(num_cores);
-#endif
-  } else {
-    register int32_t local_sum = 0;
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      local_sum += in_a[idx] * in_b[idx];
-      local_sum += in_a[idx + 1] * in_b[idx + 1];
-      local_sum += in_a[idx + 2] * in_b[idx + 2];
-      local_sum += in_a[idx + 3] * in_b[idx + 3];
-      idx += N_BANK_PE;
-    }
-    if (core_id == (Len % N_BANK_PE) / 4) {
-      while (idx < Len) {
-        local_sum += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    if (core_id < nPE) {
-      mempool_stop_benchmark();
-      mempool_start_benchmark();
-    }
-    __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_partial_barrier(2, core_id, nPE);
-#else
-    mempool_barrier(num_cores);
-#endif
-  }
-}
-
-/* Parallel dot-product with loop unrolling */
-void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                                   uint32_t Len, uint32_t nPE) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  register int32_t local_sum_1 = 0;
-  register int32_t local_sum_2 = 0;
-  register int32_t local_sum_3 = 0;
-  register int32_t local_sum_4 = 0;
-
-  if (nPE == num_cores) {
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      int32_t in_a1 = in_a[idx];
-      int32_t in_b1 = in_b[idx];
-      int32_t in_a2 = in_a[idx + 1];
-      int32_t in_b2 = in_b[idx + 1];
-      int32_t in_a3 = in_a[idx + 2];
-      int32_t in_b3 = in_b[idx + 2];
-      int32_t in_a4 = in_a[idx + 3];
-      int32_t in_b4 = in_b[idx + 3];
-      local_sum_1 += in_a1 * in_b1;
-      local_sum_2 += in_a2 * in_b2;
-      local_sum_3 += in_a3 * in_b3;
-      local_sum_4 += in_a4 * in_b4;
-      idx += N_BANK;
-    }
-    if (core_id == ((Len % N_BANK) / 4)) {
-      while (idx < Len) {
-        local_sum_1 += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    local_sum_1 += local_sum_2;
-    local_sum_3 += local_sum_4;
-    local_sum_1 += local_sum_3;
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
-    __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_barrier(2, core_id);
-#else
-    mempool_barrier(num_cores);
-#endif
-  } else {
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      int32_t in_a1 = in_a[idx];
-      int32_t in_b1 = in_b[idx];
-      int32_t in_a2 = in_a[idx + 1];
-      int32_t in_b2 = in_b[idx + 1];
-      int32_t in_a3 = in_a[idx + 2];
-      int32_t in_b3 = in_b[idx + 2];
-      int32_t in_a4 = in_a[idx + 3];
-      int32_t in_b4 = in_b[idx + 3];
-      local_sum_1 += in_a1 * in_b1;
-      local_sum_2 += in_a2 * in_b2;
-      local_sum_3 += in_a3 * in_b3;
-      local_sum_4 += in_a4 * in_b4;
-      idx += N_BANK_PE;
-    }
-    if (core_id == ((Len % N_BANK_PE) / 4)) {
-      while (idx < Len) {
-        local_sum_1 += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    local_sum_1 += local_sum_2;
-    local_sum_3 += local_sum_4;
-    local_sum_1 += local_sum_3;
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
-    __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_partial_barrier(2, core_id, nPE);
-#else
-    mempool_barrier(num_cores);
-#endif
-  }
-}
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h b/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h
deleted file mode 100644
index 0ad166d41..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/*
-  Parallel dot-product with atomic fetch and add towards local memory
-  locations and final reduction by a single core. The cores write in
-  memory banks separated by a "step".
-    A) Parallelized workload
-    B) Atomic fetch and add to local memory banks
-    C) Barrier
-    D) Final reduction by core 0 incorporated in a barrier */
-
-/*******************************************************/
-/**                    MULTI-CORE                     **/
-/*******************************************************/
-
-/* Parallel dot-product */
-void dotp_parallel_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
-                        uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  int32_t local_sum = 0;
-
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    local_sum += in_a[idx] * in_b[idx];
-    local_sum += in_a[idx + 1] * in_b[idx + 1];
-    local_sum += in_a[idx + 2] * in_b[idx + 2];
-    local_sum += in_a[idx + 3] * in_b[idx + 3];
-    idx += N_BANK;
-  }
-  if (core_id == (Len % N_BANK) / 4) {
-    while (idx < Len) {
-      local_sum += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum,
-                     __ATOMIC_RELAXED);
-  mempool_stop_benchmark();
-
-  mempool_start_benchmark();
-  if ((num_cores - 1) ==
-      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
-    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
-    __sync_synchronize(); // Full memory barrier
-    uint32_t idx_red = 0;
-    local_sum = 0;
-    while (idx_red < N_BANK) {
-      local_sum += s[idx_red];
-      idx_red += STEP;
-    }
-    s[0] = local_sum;
-    wake_up_all();
-  }
-  mempool_wfi();
-}
-
-/* Parallel dot-product with loop unrolling */
-void dotp_parallel_unrolled4_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
-                                  uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  int32_t local_sum_1 = 0;
-  int32_t local_sum_2 = 0;
-  int32_t local_sum_3 = 0;
-  int32_t local_sum_4 = 0;
-
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    int32_t in_a1 = in_a[idx];
-    int32_t in_b1 = in_b[idx];
-    int32_t in_a2 = in_a[idx + 1];
-    int32_t in_b2 = in_b[idx + 1];
-    int32_t in_a3 = in_a[idx + 2];
-    int32_t in_b3 = in_b[idx + 2];
-    int32_t in_a4 = in_a[idx + 3];
-    int32_t in_b4 = in_b[idx + 3];
-    local_sum_1 += in_a1 * in_b1;
-    local_sum_2 += in_a2 * in_b2;
-    local_sum_3 += in_a3 * in_b3;
-    local_sum_4 += in_a4 * in_b4;
-    idx += N_BANK;
-  }
-  if (core_id == ((Len % N_BANK) / 4)) {
-    while (idx < Len) {
-      local_sum_1 += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  local_sum_1 += local_sum_2;
-  local_sum_3 += local_sum_4;
-  local_sum_1 += local_sum_3;
-  __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum_1,
-                     __ATOMIC_RELAXED);
-  mempool_stop_benchmark();
-
-  mempool_start_benchmark();
-  if ((num_cores - 1) ==
-      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
-    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
-    __sync_synchronize(); // Full memory barrier
-    uint32_t idx_red = 0;
-    local_sum_1 = 0;
-    while (idx_red < N_BANK) {
-      local_sum_1 += s[idx_red];
-      idx_red += STEP;
-    }
-    s[0] = local_sum_1;
-    wake_up_all();
-  }
-  mempool_wfi();
-}
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h b/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h
deleted file mode 100644
index 3659de0a3..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/*
-  Parallel dot-product with atomic fetch and add towards local memory
-  locations and final reduction by a single core. The cores write in
-  memory banks separated by a "step".
-    A) Parallelized workload
-    B) Atomic fetch and add to local memory banks
-    C) Barrier
-    D) Final reduction by core 0 incorporated in a barrier */
-
-/*******************************************************/
-/**                    MULTI-CORE                     **/
-/*******************************************************/
-
-void mempool_log_reduction(int32_t *sum, uint32_t volatile step,
-                           uint32_t core_id);
-
-/* Parallel dot-product */
-void dotp_parallel_redtree(int32_t *in_a, int32_t *in_b, int32_t *s,
-                           uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-
-  register int32_t local_sum = 0;
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    local_sum += in_a[idx] * in_b[idx];
-    local_sum += in_a[idx + 1] * in_b[idx + 1];
-    local_sum += in_a[idx + 2] * in_b[idx + 2];
-    local_sum += in_a[idx + 3] * in_b[idx + 3];
-    idx += N_BANK;
-  }
-  if (core_id == (Len % N_BANK) / 4) {
-    while (idx < Len) {
-      local_sum += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  s[core_id * 4] = local_sum; // Each core is storing locally
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  mempool_log_reduction(s, 2, core_id);
-}
-
-void dotp_parallel_redtree_unrolled(int32_t *in_a, int32_t *in_b, int32_t *s,
-                                    uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  register int32_t local_sum_1 = 0;
-  register int32_t local_sum_2 = 0;
-  register int32_t local_sum_3 = 0;
-  register int32_t local_sum_4 = 0;
-
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    int32_t in_a1 = in_a[idx];
-    int32_t in_b1 = in_b[idx];
-    int32_t in_a2 = in_a[idx + 1];
-    int32_t in_b2 = in_b[idx + 1];
-    int32_t in_a3 = in_a[idx + 2];
-    int32_t in_b3 = in_b[idx + 2];
-    int32_t in_a4 = in_a[idx + 3];
-    int32_t in_b4 = in_b[idx + 3];
-    local_sum_1 += in_a1 * in_b1;
-    local_sum_2 += in_a2 * in_b2;
-    local_sum_3 += in_a3 * in_b3;
-    local_sum_4 += in_a4 * in_b4;
-    idx += N_BANK;
-  }
-  if (core_id == ((Len % N_BANK) / 4)) {
-    while (idx < Len) {
-      local_sum_1 += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  local_sum_1 += local_sum_2;
-  local_sum_3 += local_sum_4;
-  local_sum_1 += local_sum_3;
-  s[core_id * 4] = local_sum_1; // Each core is storing locally
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  mempool_log_reduction(s, 2, core_id);
-}
-
-void mempool_log_reduction(int32_t *sum, uint32_t volatile step,
-                           uint32_t core_id) {
-
-  uint32_t idx_sum, idx = (step * (core_id / step)) * 4;
-  uint32_t next_step, previous_step;
-  register int32_t local_sum;
-  uint32_t num_cores = mempool_get_core_count();
-
-  previous_step = step >> 1;
-  if ((step - previous_step) ==
-      __atomic_fetch_add(&red_barrier[idx + previous_step - 1], previous_step,
-                         __ATOMIC_RELAXED)) {
-
-    local_sum = 0;
-    idx_sum = idx;
-    while (idx_sum < idx + step * 4) {
-      local_sum += sum[idx_sum];
-      idx_sum += previous_step * 4;
-    }
-    sum[idx] = local_sum;
-
-    next_step = step << 1;
-    __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
-                     __ATOMIC_RELAXED);
-    if (num_cores == step) {
-      sum[0] = sum[idx];
-      __sync_synchronize(); // Full memory barrier
-      wake_up_all();
-      mempool_wfi();
-    } else {
-      mempool_log_reduction(sum, next_step, core_id);
-    }
-
-  } else
-    mempool_wfi();
-}
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index f7cf7508f..ddc1ef141 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -8,132 +8,72 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "define.h"
+#include "data_dotp_i32.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+#define LOG_BARRIERS
+// #define ATOMIC_REDUCTION
+// #define SINGLE_CORE_REDUCTION
+#define BINARY_REDUCTION
 
-#include "dotp_parallel.h"
-#include "dotp_parallel_local.h"
-#include "dotp_parallel_red0.h"
-#include "dotp_parallel_redtree.h"
-#include "dotp_single.h"
+// Vectors for kernel computation
+int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
+int32_t l1_Y[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
+uint32_t red_barrier[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
-void init_vectors(int32_t *in_a, int32_t *in_b, int32_t *s, int32_t *p_result,
-                  int32_t *p_check, uint32_t Len) {
-  *p_result = 0;
-  *p_check = 0;
-  uint32_t j = 0;
-  uint32_t num_cores = mempool_get_core_count();
-  while (j < Len) {
-    int32_t a = (int32_t)(j % num_cores);
-    int32_t b = (int32_t)(j % 4 + 3);
-    in_a[j] = a;
-    in_b[j] = b;
-    *p_check = *p_check + (int32_t)(a * b);
-    j++;
-  }
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) ||               \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-  for (uint32_t k = 0; k < N_BANK; k++) {
-    s[k] = 0;
-    red_barrier[k] = 0;
-  }
-#else
-  *s = 0;
-#endif
-}
+#include "baremetal/mempool_dotp_i32p.h"
+#include "baremetal/mempool_dotp_i32s.h"
 
 int main() {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   uint32_t time_init, time_end;
-  // initialize synchronization variables
   mempool_barrier_init(core_id);
 
+  time_init = 0;
+  time_end = 0;
   if (core_id == 0) {
-    error = 0;
-    time_init = 0;
-    time_end = 0;
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) ||               \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-    init_vectors(vector_a, vector_b, sum, &result, &check, LEN);
-#else
-    init_vectors(vector_a, vector_b, &sum, &result, &check, LEN);
-#endif
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
+  }
+  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
+    sum[k] = 0;
+    red_barrier[k] = 0;
   }
-  mempool_barrier(num_cores); // wait until all cores have finished
+  mempool_barrier(num_cores);
 
-  // Kernel execution
+  //  // SINGLE-CORE
+  //  time_init = mempool_get_timer();
+  //  dotp_i32s_unrolled4(l1_X, l1_Y, sum, array_N);
+  //  time_end = mempool_get_timer();
 
-  time_init = mempool_get_timer();
-#ifdef SINGLE
-  dotp_single(vector_a, vector_b, &sum, LEN);
-#elif defined(SINGLE_UNROLLED)
-  dotp_single_unrolled4(vector_a, vector_b, &sum, LEN);
-#endif
-  time_end = mempool_get_timer();
+  //  // PARALLEL
+  //  time_init = mempool_get_timer();
+  //  dotp_i32p(l1_X, l1_Y, sum, array_N, num_cores);
+  //  time_end = mempool_get_timer();
 
+  // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  mempool_start_benchmark();
-/* A) Parallelized workload
-   B) Atomic fetch and add to a single memory location
-   C) Barrier */
-#ifdef PARALLEL
-  dotp_parallel(vector_a, vector_b, &sum, LEN, N_PE);
-#elif defined(PARALLEL_UNROLLED)
-  dotp_parallel_unrolled4(vector_a, vector_b, &sum, LEN, N_PE);
-/* A) Parallelized workload
-   B) Atomic fetch and add to local memory banks
-   C) Barrier
-   D) Final reduction by core 0 incorporated in a barrier */
-#elif defined(PARALLEL_RED0)
-  dotp_parallel_red0(vector_a, vector_b, sum, LEN, N_PE);
-#elif defined(PARALLEL_UNROLLED_RED0)
-  dotp_parallel_unrolled4_red0(vector_a, vector_b, sum, LEN, N_PE);
-/* A) Parallelized workload
-   B) Nested set of barriers: reduction is performed in a logarithmic tree. */
-#elif defined(PARALLEL_REDTREE)
-  dotp_parallel_redtree(vector_a, vector_b, sum, LEN, N_PE);
-#elif defined(PARALLEL_UNROLLED_REDTREE)
-  dotp_parallel_redtree_unrolled(vector_a, vector_b, sum, LEN, N_PE);
-#endif
-  mempool_stop_benchmark();
+  dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
   time_end = mempool_get_timer();
 
-  /* A) Parallelized workload
-     B) Atomic fetch and add to a single memory location
-     C) Barrier */
-  if (core_id < N_PE) {
-    time_init = mempool_get_timer();
-    mempool_start_benchmark();
-#ifdef PARALLEL_LOCAL
-    dotp_parallel_local(vector_a, vector_b, &sum, LEN, N_PE);
-#elif defined(LOCAL_UNROLLED)
-    dotp_parallel_local_unrolled4(vector_a, vector_b, &sum, LEN, N_PE);
-#endif
-    mempool_stop_benchmark();
-    time_end = mempool_get_timer();
-  }
-
-  mempool_barrier(num_cores);
   // Check results
+  mempool_barrier(num_cores);
   if (core_id == 0) {
     uint32_t clock_cycles = (time_end - time_init);
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) ||               \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-    result = sum[0];
-#else
-    result = sum;
-#endif
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
-    printf("Result ==> %d\n", result);
-    printf("Check  ==> %d\n\n", check);
+    printf("Result ==> %d\n", sum[0]);
+    printf("Check  ==> %d\n\n", l2_Z);
   }
   mempool_barrier(num_cores);
 
-  return error;
+  return 0;
 }
diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c
index b3b474b1d..99a0269cc 100644
--- a/software/apps/baremetal/matmul_f16/main.c
+++ b/software/apps/baremetal/matmul_f16/main.c
@@ -34,8 +34,10 @@ int main() {
 
   // Initialize Matrices 1
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, (matrix_M * matrix_N) * sizeof(int16_t));
-    dma_memcpy_blocking(matrix_b, B, (matrix_N * matrix_P) * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_a, l2_A,
+                        (matrix_M * matrix_N) * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_b, l2_B,
+                        (matrix_N * matrix_P) * sizeof(int16_t));
   }
   mempool_barrier(num_cores);
 
@@ -59,7 +61,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_f16(matrix_c, C, matrix_M * matrix_P, 0.5f, 0);
+  mempool_check_f16(matrix_c, l2_C, matrix_M * matrix_P, 0.5f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c
index bc391200f..d3d7622db 100644
--- a/software/apps/baremetal/matmul_f32/main.c
+++ b/software/apps/baremetal/matmul_f32/main.c
@@ -30,13 +30,14 @@ int main() {
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier_init(core_id);
 
-  // Initialize Matrices
+  // Initialize data
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, matrix_M * matrix_N * sizeof(int32_t));
-    dma_memcpy_blocking(matrix_b, B, matrix_N * matrix_P * sizeof(int32_t));
+    dma_memcpy_blocking(matrix_a, l2_A, matrix_M * matrix_N * sizeof(int32_t));
+    dma_memcpy_blocking(matrix_b, l2_B, matrix_N * matrix_P * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
 
+  // Benchmark
 #if defined(SINGLE)
   if (core_id == 0) {
     // Execute function to test.
@@ -57,7 +58,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_f32(matrix_c, C, matrix_M * matrix_P, 0.01f, 0);
+  mempool_check_f32(matrix_c, l2_C, matrix_M * matrix_P, 0.01f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/matmul_i16/main.c b/software/apps/baremetal/matmul_i16/main.c
index 5fe981858..a2b554dfa 100644
--- a/software/apps/baremetal/matmul_i16/main.c
+++ b/software/apps/baremetal/matmul_i16/main.c
@@ -7,135 +7,46 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i16p.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define matrix_M 64
-#define matrix_N 64
-#define matrix_P 64
-
-int16_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
-int16_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
-int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_matmul_i16p.h"
+#include "data_matmul_i16.h"
 
-int volatile error __attribute__((section(".l1")));
+int16_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int16_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
 
-void init_matrix(int16_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int16_t a, int16_t b, int16_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] =
-            (int16_t)(a * (int16_t)i + b * (int16_t)j + c);
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] =
-            (int16_t)(a * (int16_t)i + b * (int16_t)j + c);
-      }
-    }
-  }
-}
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
 
-// Initialize the matrices in parallel
-int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                  uint32_t inner_dim, int16_t aa, int16_t ab, int16_t ac,
-                  int16_t ba, int16_t bb, int16_t bc, uint32_t core_id,
-                  uint32_t num_cores) {
-  // Convert to signed
-  int32_t n = (int32_t)inner_dim;
-  // Parallelize over rows
-  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
-    for (uint32_t j = 0; j < num_columns; ++j) {
-      int32_t ii = (int32_t)i;
-      int32_t jj = (int32_t)j;
-      int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj +
-                     (int32_t)ac * bc) *
-                    n;
-      int32_t qua =
-          (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) *
-           (n * (n - 1))) /
-          2;
-      int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
-      int32_t golden = lin + qua + cub;
-      if (matrix[i * num_columns + j] != golden) {
-        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
-      }
-      matrix[i * num_columns + j] = 0;
-    }
+  // Initialize data
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int16_t));
+    dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int16_t));
   }
-  return 0;
-}
-
-int test_matrix_multiplication(int16_t *__restrict__ A, int16_t *__restrict__ B,
-                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
-                               uint32_t P, uint32_t core_id,
-                               uint32_t num_cores) {
-  int16_t const A_a = 1;
-  int16_t const A_b = 1;
-  int16_t const A_c = -40;
-  int16_t const B_a = 0;
-  int16_t const B_b = 1;
-  int16_t const B_c = 19;
-
-  // Initialize Matrices
-  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
-  // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
-  // Execute function to test.
-  mempool_start_benchmark();
 
+  // Benchmark
+  mempool_start_benchmark();
 #ifdef __XPULPIMG
-  matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2(A, B, C, M, N, P, core_id,
-                                                     num_cores);
+  matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2(
+      l1_A, l1_B, l1_C, matrix_M, matrix_N, matrix_P, core_id, num_cores);
 #else
-  matmul_unrolled_2x2_parallel_i16_rv32im(A, B, C, M, N, P, core_id, num_cores);
+  matmul_unrolled_2x2_parallel_i16_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                          matrix_P, core_id, num_cores);
 #endif
-
   mempool_stop_benchmark();
-  // Wait at barrier befor checking
   mempool_barrier(num_cores);
-  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
-                    num_cores)) {
-    error = 1;
-    return -1;
-  }
-  return 0;
-}
 
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  // Test the Matrix multiplication
-  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
-                             matrix_P, core_id, num_cores);
-  // wait until all cores have finished
+  // Verify results
+  mempool_check_i32(l1_C, l2_C, matrix_M * matrix_P, 0, 0);
   mempool_barrier(num_cores);
-
-  return error;
+  return 0;
 }
diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c
index 65e2b82f1..3713dcabe 100644
--- a/software/apps/baremetal/matmul_i32/main.c
+++ b/software/apps/baremetal/matmul_i32/main.c
@@ -7,131 +7,46 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define matrix_M 64
-#define matrix_N 32
-#define matrix_P 64
-
-int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
-int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
-int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_matmul_i32p.h"
+#include "data_matmul_i32.h"
 
-int volatile error __attribute__((section(".l1")));
+int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
 
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
 
-// Initialize the matrices in parallel
-int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                  uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac,
-                  int32_t ba, int32_t bb, int32_t bc, uint32_t core_id,
-                  uint32_t num_cores) {
-  // Convert to signed
-  int32_t n = (int32_t)inner_dim;
-  // Parallelize over rows
-  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
-    for (uint32_t j = 0; j < num_columns; ++j) {
-      int32_t ii = (int32_t)i;
-      int32_t jj = (int32_t)j;
-      int32_t lin =
-          (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n;
-      int32_t qua =
-          ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) /
-          2;
-      int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
-      int32_t golden = lin + qua + cub;
-      if (matrix[i * num_columns + j] != golden) {
-        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
-      }
-      matrix[i * num_columns + j] = 0;
-    }
+  // Initialize data
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int32_t));
   }
-  return 0;
-}
-
-int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B,
-                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
-                               uint32_t P, uint32_t core_id,
-                               uint32_t num_cores) {
-  int32_t const A_a = 1;
-  int32_t const A_b = 1;
-  int32_t const A_c = -32;
-  int32_t const B_a = 2;
-  int32_t const B_b = 1;
-  int32_t const B_c = 16;
-
-  // Initialize Matrices
-  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
-  // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
-  // Execute function to test.
-  mempool_start_benchmark();
 
+  // Benchmark
+  mempool_start_benchmark();
 #ifdef __XPULPIMG
-  matmul_unrolled_2x2_parallel_i32_xpulpv2(A, B, C, M, N, P, core_id,
-                                           num_cores);
+  matmul_unrolled_2x2_parallel_i32_xpulpv2(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                           matrix_P, core_id, num_cores);
 #else
-  matmul_unrolled_2x2_parallel_i32_rv32im(A, B, C, M, N, P, core_id, num_cores);
+  matmul_unrolled_2x2_parallel_i32_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                          matrix_P, core_id, num_cores);
 #endif
-
   mempool_stop_benchmark();
-  // Wait at barrier befor checking
   mempool_barrier(num_cores);
-  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
-                    num_cores)) {
-    error = 1;
-    return -1;
-  }
-  return 0;
-}
 
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  // Test the Matrix multiplication
-  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
-                             matrix_P, core_id, num_cores);
-  // wait until all cores have finished
+  // Verify results
+  mempool_check_i32(l1_C, l2_C, matrix_M * matrix_P, 0, 0);
   mempool_barrier(num_cores);
-
-  return error;
+  return 0;
 }
diff --git a/software/apps/baremetal/matmul_i8/main.c b/software/apps/baremetal/matmul_i8/main.c
index 4fb557f2c..3aa99a4e6 100644
--- a/software/apps/baremetal/matmul_i8/main.c
+++ b/software/apps/baremetal/matmul_i8/main.c
@@ -7,137 +7,46 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i8p.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define matrix_M 64
-#define matrix_N 64
-#define matrix_P 64
-
-int8_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
-int8_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
-int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_matmul_i8p.h"
+#include "data_matmul_i8.h"
 
-int volatile error __attribute__((section(".l1")));
+int8_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int8_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
 
-void init_matrix(int8_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int8_t a, int8_t b, int8_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] =
-            (int8_t)(a * (int8_t)i + b * (int8_t)j + c);
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] =
-            (int8_t)(a * (int8_t)i + b * (int8_t)j + c);
-      }
-    }
-  }
-}
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
 
-// Initialize the matrices in parallel
-int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                  uint32_t inner_dim, int8_t aa, int8_t ab, int8_t ac,
-                  int8_t ba, int8_t bb, int8_t bc, uint32_t core_id,
-                  uint32_t num_cores) {
-  // Convert to signed
-  int32_t n = (int32_t)inner_dim;
-  // Parallelize over rows
-  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
-    for (uint32_t j = 0; j < num_columns; ++j) {
-      int32_t ii = (int32_t)i;
-      int32_t jj = (int32_t)j;
-      int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj +
-                     (int32_t)ac * bc) *
-                    n;
-      int32_t qua =
-          (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) *
-           (n * (n - 1))) /
-          2;
-      int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
-      int32_t golden = lin + qua + cub;
-      if (matrix[i * num_columns + j] != golden) {
-        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
-      }
-      matrix[i * num_columns + j] = 0;
-    }
+  // Initialize data
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int8_t));
+    dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int8_t));
   }
-  return 0;
-}
-
-int test_matrix_multiplication(int8_t *__restrict__ A, int8_t *__restrict__ B,
-                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
-                               uint32_t P, uint32_t core_id,
-                               uint32_t num_cores) {
-  int8_t const A_a = 1;
-  int8_t const A_b = 1;
-  int8_t const A_c = -40;
-  int8_t const B_a = 0;
-  int8_t const B_b = 1;
-  int8_t const B_c = 19;
-
-  // Initialize Matrices
-  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
-  // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
-  // Execute function to test.
-  mempool_start_benchmark();
 
+  // Benchmark
+  mempool_start_benchmark();
 #ifdef __XPULPIMG
-  matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id,
-                                                    num_cores);
-  // matmul_unrolled_2x4_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id,
-  // num_cores);
+  matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(
+      l1_A, l1_B, l1_C, matrix_M, matrix_N, matrix_P, core_id, num_cores);
 #else
-  matmul_unrolled_2x2_parallel_i8_rv32im(A, B, C, M, N, P, core_id, num_cores);
+  matmul_unrolled_2x2_parallel_i8_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                         matrix_P, core_id, num_cores);
 #endif
-
   mempool_stop_benchmark();
-  // Wait at barrier befor checking
   mempool_barrier(num_cores);
-  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
-                    num_cores)) {
-    error = 1;
-    return -1;
-  }
-  return 0;
-}
 
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  // Test the Matrix multiplication
-  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
-                             matrix_P, core_id, num_cores);
-  // wait until all cores have finished
+  // Verify results
+  mempool_check_i32(l1_C, l2_C, matrix_M * matrix_P, 0, 0);
   mempool_barrier(num_cores);
-
-  return error;
+  return 0;
 }
diff --git a/software/apps/systolic/Makefile b/software/apps/systolic/Makefile
index 525b4b017..93e960434 100644
--- a/software/apps/systolic/Makefile
+++ b/software/apps/systolic/Makefile
@@ -14,8 +14,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime)
 include $(RUNTIME_DIR)/runtime.mk
 
 APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c"))
-DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args"))
-ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 
 # Define the rule to build all applications
@@ -26,7 +24,7 @@ $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S
 # Check if the config is set to systolic
 ifeq ($(config),systolic)
 .PHONY: $(BINARIES)
-$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes
+$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes
 	mkdir -p $(dir $@)
 	$(RISCV_CC) -Iinclude $(RISCV_LDFLAGS) -o $@ $< $(RUNTIME) -T$(RUNTIME_DIR)/link.ld
 	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump
diff --git a/software/data/README.md b/software/data/README.md
new file mode 100644
index 000000000..9fdab87cf
--- /dev/null
+++ b/software/data/README.md
@@ -0,0 +1,29 @@
+# Data Generation
+
+Data for mempool applications is generated with the `gendata_header.py` script.
+The `gendatalib.py` library generates random inputs and a reference golden model for the applications under test.
+The application parameters are passed to the script with the `gendata_params.hjson` file.
+
+An example entry follows. `matmul_f32` is the name of the MemPool application under test; `type` refers to the numpy precision; `defines` lists the application parameters, which are turned into C constant declarations of the form `#define matrix_M (16)`; `arrays` encodes the C type and name of the input vectors for the application under test.
+
+```
+  "matmul_f32": {
+    "type": "float32",
+    "defines": [
+      ("matrix_M", 16)
+      ("matrix_N", 16)
+      ("matrix_P", 16)
+    ]
+    "arrays": [
+      ("float", "l2_A")
+      ("float", "l2_B")
+      ("float", "l2_C")
+    ]
+  }
+```
+
+## To test a new application:
+If a new application needs to be tested with data generated from a reference golden model:
+- Add a new golden model to the existing library `gendatalib.py`.
+- Add a golden model function call to the `gendata_header.py`.
+- Add a new item in the `gendata_params.hjson` to make function parameters configurable.
diff --git a/software/data/data_cfft_radix2_q16.h.tpl b/software/data/data_cfft_radix2_q16.h.tpl
deleted file mode 100644
index 6044e424d..000000000
--- a/software/data/data_cfft_radix2_q16.h.tpl
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_cfft_radix2_q16.py
-
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-// Tolerance for correctness check
-#define TOLERANCE (${tolerance})
-
-% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
-
-// Data arrays for matrix ${m_str}
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)};
-
-% endfor \
-
-// Twiddles
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
-
-// Bitreversal
-uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/data/data_cfft_radix2_q16.py b/software/data/data_cfft_radix2_q16.py
deleted file mode 100644
index e1615e53e..000000000
--- a/software/data/data_cfft_radix2_q16.py
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the cfft kernel.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-
-##################
-# compute_result #
-##################
-
-
-def compute_result(inp, len):
-    """
-    Funciton to generate the expected result of the testcase.
-
-    Arguments
-    ---------
-    input: numpy array of inputs
-    env: Length of the input transform.
-    """
-
-    # Q16:
-    # len=16:    Q1.15 -> Q5.11
-    # len=32:    Q1.15 -> Q6.10
-    # len=64:    Q1.15 -> Q7.9
-    # len=128:   Q1.15 -> Q8.8
-    # len=256:   Q1.15 -> Q9.7
-    # len=512:   Q1.15 -> Q10.6
-    # len=1024:  Q1.15 -> Q11.5
-    # len=2048:  Q1.15 -> Q12.4
-    # len=4096:  Q1.15 -> Q13.3
-    bit_shift_dict_q16 = {
-        16: 11,
-        32: 10,
-        64: 9,
-        128: 8,
-        256: 7,
-        512: 6,
-        1024: 5,
-        2048: 4,
-        4096: 3}
-    my_type = np.int16
-    my_fixpoint = 15
-    bit_shift_dict = bit_shift_dict_q16
-    a = inp.astype(my_type)
-    result = np.zeros(a.size, dtype=my_type)
-    complex_a = np.zeros(int(a.size / 2), dtype=np.csingle)
-    complex_result = np.zeros(a.size >> 1, dtype=np.csingle)
-    for i in range(a.size >> 1):
-        complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + (
-            a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j
-    complex_result = np.fft.fft(complex_a)
-    for i in range(int(a.size / 2)):
-        result[2 * i] = (np.real(complex_result[i]) *
-                         (2**(bit_shift_dict[int(a.size / 2)]))
-                         ).astype(my_type)
-        result[2 * i + 1] = (np.imag(complex_result[i]) *
-                             (2**(bit_shift_dict[int(a.size / 2)]))
-                             ).astype(my_type)
-
-    return result
-
-
-def compute_twiddles(length):
-    PI = 3.14159265358979
-    N = length
-    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
-    for i in range(0, (int)(3 * N / 4)):
-        twiddleCoefq15_cos = M.cos(i * 2 * PI / N)
-        twiddleCoefq15_sin = M.sin(i * 2 * PI / N)
-        twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1)))
-        twiddleCoefq15[2 * i +
-                       1] = int(round(twiddleCoefq15_sin * (2**15 - 1)))
-    return twiddleCoefq15
-
-
-def compute_bitreversal(N, R):
-
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-
-    return tps
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_radix2_q16.h.tpl",
-        help='Path to mako template')
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-d",
-        "--dimension",
-        type=int,
-        required=False,
-        default=64,
-        help='Input dimension'
-    )
-
-    args = parser.parse_args()
-
-    # Create sparse matrix
-    Len = args.dimension
-    Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16)
-    Result = compute_result(Input, Len)
-    Twiddles = compute_twiddles(Len)
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2)))
-
-    tolerance = {
-        16: 16,
-        32: 20,
-        64: 24,
-        128: 28,
-        256: 32,
-        512: 48,
-        1024: 64,
-        2048: 96,
-        4096: 128}
-
-    kwargs = {'name': 'data_cfft_radix2_q16',
-              'vector_inp': Input,
-              'vector_res': Result,
-              'vector_twi': Twiddles,
-              'vector_bitrev': Bitreversal,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': int(2 * len(Bitreversal)),
-              'tolerance': tolerance[int(Len)]}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cfft_radix4_q16.h.tpl b/software/data/data_cfft_radix4_q16.h.tpl
deleted file mode 100644
index 3af1b764d..000000000
--- a/software/data/data_cfft_radix4_q16.h.tpl
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_cfft_radix4_q16.py
-
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-// Maximum number of independent FFT columns allowed
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
-// Tolerance for correctness check
-#define TOLERANCE (${tolerance})
-
-% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
-
-// Data arrays for matrix ${m_str}
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)};
-
-% endfor \
-
-// Twiddles
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
-
-// Bitreversal
-uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/data/data_cfft_radix4_q16.py b/software/data/data_cfft_radix4_q16.py
deleted file mode 100755
index b394a2884..000000000
--- a/software/data/data_cfft_radix4_q16.py
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the cfft kernel.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-
-##################
-# compute_result #
-##################
-
-
-def compute_result(inp, len):
-    """
-    Funciton to generate the expected result of the testcase.
-
-    Arguments
-    ---------
-    input: numpy array of inputs
-    env: Length of the input transform.
-    """
-
-    # Q16:
-    # len=16:    Q1.15 -> Q5.11
-    # len=32:    Q1.15 -> Q6.10
-    # len=64:    Q1.15 -> Q7.9
-    # len=128:   Q1.15 -> Q8.8
-    # len=256:   Q1.15 -> Q9.7
-    # len=512:   Q1.15 -> Q10.6
-    # len=1024:  Q1.15 -> Q11.5
-    # len=2048:  Q1.15 -> Q12.4
-    # len=4096:  Q1.15 -> Q13.3
-    bit_shift_dict_q16 = {
-        16: 11,
-        32: 10,
-        64: 9,
-        128: 8,
-        256: 7,
-        512: 6,
-        1024: 5,
-        2048: 4,
-        4096: 3}
-    my_type = np.int16
-    my_fixpoint = 15
-    bit_shift_dict = bit_shift_dict_q16
-    a = inp.astype(my_type)
-    result = np.zeros(a.size, dtype=my_type)
-    complex_a = np.zeros(int(a.size / 2), dtype=np.csingle)
-    complex_result = np.zeros(a.size >> 1, dtype=np.csingle)
-    for i in range(a.size >> 1):
-        complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + (
-            a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j
-    complex_result = np.fft.fft(complex_a)
-    for i in range(int(a.size / 2)):
-        result[2 * i] = (np.real(complex_result[i]) *
-                         (2**(bit_shift_dict[int(a.size / 2)]))
-                         ).astype(my_type)
-        result[2 * i + 1] = (np.imag(complex_result[i]) *
-                             (2**(bit_shift_dict[int(a.size / 2)]))
-                             ).astype(my_type)
-
-    return result
-
-
-def compute_twiddles(length):
-    PI = 3.14159265358979
-    N = length
-    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
-    for i in range(0, (int)(3 * N / 4)):
-        twiddleCoefq15_cos = M.cos(i * 2 * PI / N)
-        twiddleCoefq15_sin = M.sin(i * 2 * PI / N)
-        twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1)))
-        twiddleCoefq15[2 * i +
-                       1] = int(round(twiddleCoefq15_sin * (2**15 - 1)))
-    return twiddleCoefq15
-
-
-def compute_bitreversal(N, R):
-
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-
-    return tps
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_radix4_q16.h.tpl",
-        help='Path to mako template')
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-d",
-        "--dimension",
-        type=int,
-        required=False,
-        default=64,
-        help='Input dimension'
-    )
-
-    args = parser.parse_args()
-
-    # Create sparse matrix
-    Len = args.dimension
-    Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16)
-    Result = compute_result(Input, Len)
-    Twiddles = compute_twiddles(Len)
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2)))
-
-    tolerance = {
-        16: 16,
-        32: 20,
-        64: 24,
-        128: 28,
-        256: 32,
-        512: 48,
-        1024: 64,
-        2048: 96,
-        4096: 128}
-
-    kwargs = {'name': 'data_cfft_radix4_q16',
-              'vector_inp': Input,
-              'vector_res': Result,
-              'vector_twi': Twiddles,
-              'vector_bitrev': Bitreversal,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': len(Bitreversal),
-              'tolerance': tolerance[int(Len)]}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_chest_q16.h.tpl b/software/data/data_chest_q16.h.tpl
deleted file mode 100644
index 2e11a26e3..000000000
--- a/software/data/data_chest_q16.h.tpl
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_chest_q16.py
-
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 32 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N_TX (${nb_tx})
-#define N_RX (${nb_rx})
-#define N_SAMPLES (${nb_samples})
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotRX[${2*nb_rx*nb_samples}] = ${array_to_cstr(pilot_rx)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotTX[${2*nb_tx*nb_samples}] = ${array_to_cstr(pilot_tx)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_HEST[${2*nb_rx*nb_tx*nb_samples}] = ${array_to_cstr(Hest)};
diff --git a/software/data/data_chest_q16.py b/software/data/data_chest_q16.py
deleted file mode 100755
index e1fca8649..000000000
--- a/software/data/data_chest_q16.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the Channel estimation.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-
-from mako.template import Template
-
-##################
-#  write_result  #
-##################
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-######################
-# Fixpoint Functions #
-######################
-
-
-def q_sat(x):
-    if x > 2**15 - 1:
-        return x - 2**16
-    elif x < -2**15:
-        return x + 2**16
-    else:
-        return x
-
-
-def compute_chest_q16(in_rx, in_tx, p):
-    n_rx = in_rx.size
-    n_tx = in_tx.size
-    result = np.zeros(2 * (n_tx * n_rx), dtype=np.int16)
-    for i in range(n_rx):
-        a_r = in_rx[i].real
-        a_i = in_rx[i].imag
-        for j in range(n_tx):
-            b_r = in_tx[j].real
-            b_i = in_tx[j].imag
-
-#            # Compute data division
-#            den = (2**16) // (b_r * b_r + b_i * b_i)
-#            num_r = (a_r * b_r) + (a_i * b_i)
-#            num_i = (a_i * b_r) - (a_r * b_i)
-#            result[2 * (i * n_tx + j)] = q_sat((num_r * den) // 2**p)
-#            result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p)
-
-            # Compute data multiplication
-            num_r = (a_r * b_r) - (a_i * b_i)
-            num_i = (a_i * b_r) + (a_r * b_i)
-            result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p)
-            result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p)
-    return result
-
-
-def generate_chest_q16(nb_tx, nb_rx, nb_samples):
-    FIXED_POINT = 8
-    MAX = 2**7
-
-    qvector_pilot_tx = []
-    qvector_pilot_rx = []
-    qvector_Hest = []
-    for k in range(nb_samples):
-        # Create pilots
-        pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_rx)
-        pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_tx)
-        # Compute Hest
-        Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT)
-
-        pilot_tx = np.column_stack(
-            (pilot_tx.imag, pilot_tx.real)).astype(
-            np.int16).flatten()
-        pilot_rx = np.column_stack(
-            (pilot_rx.imag, pilot_rx.real)).astype(
-            np.int16).flatten()
-        qvector_pilot_tx.append(pilot_tx)
-        qvector_pilot_rx.append(pilot_rx)
-        qvector_Hest.append(Hest)
-
-    qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples])
-    qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples])
-    qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples])
-    return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-b",
-        "--num_rx",
-        type=int,
-        required=False,
-        default=32,
-        help='Number beams'
-    )
-    parser.add_argument(
-        "-l",
-        "--num_tx",
-        type=int,
-        required=False,
-        default=4,
-        help='Number layers'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=32,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    nb_tx = args.num_tx
-    nb_rx = args.num_rx
-    nb_samples = args.num_samples
-
-    pilot_tx, pilot_rx, Hest = generate_chest_q16(nb_tx, nb_rx, nb_samples)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_q16.h.tpl"
-    kwargs = {'name': 'data_chest_q16',
-              'pilot_tx': pilot_tx,
-              'pilot_rx': pilot_rx,
-              'Hest': Hest,
-              'nb_tx': nb_tx,
-              'nb_rx': nb_rx,
-              'nb_samples': nb_samples}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_matmul_f16.h.tpl b/software/data/data_matmul_f16.h.tpl
deleted file mode 100644
index 96aa738a3..000000000
--- a/software/data/data_matmul_f16.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.4f}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define matrix_M (${matrix_M})
-#define matrix_N (${matrix_N})
-#define matrix_P (${matrix_P})
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)};
diff --git a/software/data/data_matmul_f32.h.tpl b/software/data/data_matmul_f32.h.tpl
deleted file mode 100644
index 4e9e6a4d6..000000000
--- a/software/data/data_matmul_f32.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define matrix_M (${matrix_M})
-#define matrix_N (${matrix_N})
-#define matrix_P (${matrix_P})
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)};
diff --git a/software/data/data_matmulf16.py b/software/data/data_matmulf16.py
deleted file mode 100644
index 2c362208b..000000000
--- a/software/data/data_matmulf16.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_matmul_f16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create matrix
-    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16)
-    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16)
-
-    kwargs = {
-        'name': 'data_matmul_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_matmulf32.py b/software/data/data_matmulf32.py
deleted file mode 100644
index 15086d0fc..000000000
--- a/software/data/data_matmulf32.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp32 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_matmul_f32.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create matrix
-    A = np.random.rand(matrix_M, matrix_N)
-    B = np.random.rand(matrix_N, matrix_P)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32)
-
-    kwargs = {
-        'name': 'data_matmul_f32',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py
new file mode 100644
index 000000000..44749a4a0
--- /dev/null
+++ b/software/data/gendata_header.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data.h files.
+# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+import argparse
+import os
+import hjson
+import ast
+import numpy
+
+import gendatalib as datalib
+
+
+header = """\
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// File generated with software/data/gendata_header.py
+// Author: Marco Bertuletti\n\n
+"""
+
+
+def format_type(typ, value):
+    """
+    Formats one value as a C literal cast to typ, for printing in .h file.
+    :param typ: C type name ((u)int8/16/32_t, 'float' or '__fp16')
+    :param value: Scalar (Python, numpy, or 0-d array) to print
+    :raises Exception: if typ is not one of the supported names
+    """
+    typ_i32b = ["int32_t", "uint32_t"]
+    typ_i16b = ["int16_t", "uint16_t"]
+    typ_i8b = ["int8_t", "uint8_t"]
+
+    # Integers are masked as Python ints: mixing a numpy scalar with a mask
+    # wider than its dtype overflows under the NumPy 2 promotion rules.
+    if typ in typ_i32b:
+        return '({}) 0X{:08X}'.format(typ, int(value) & 0xffffffff)
+    elif typ in typ_i16b:
+        return '({}) 0X{:04X}'.format(typ, int(value) & 0x0000ffff)
+    elif typ in typ_i8b:
+        return '({}) 0X{:02X}'.format(typ, int(value) & 0x000000ff)
+    elif typ == 'float':
+        return '({}) {:+.8f}'.format(typ, value)
+    elif typ == '__fp16':
+        return '({}) {:+.4f}'.format(typ, value)
+    raise Exception("ERROR: Unsupported data type!!!")
+
+
+def print_array(arr, typ, name):
+    """
+    Converts arrays to a string holding a C definition placed in section .l2.
+
+    :param arr: Input array
+    :param typ: Type of the array.
+    :param name: Name of the array.
+    """
+
+    output_string = typ
+    attr = " __attribute__((aligned(sizeof(int32_t)), section(\".l2\"))) "
+    if (arr.size > 1):
+        output_string += attr
+        output_string += name + '[{}] = {{\n'.format(arr.size)
+        for count, value in enumerate(arr, start=1):
+            output_string += (format_type(typ, value) + ', ')
+            if count % 4 == 0:  # four values per line
+                output_string += '\n'
+        # Drop the trailing separator whether or not the last row was full
+        output_string = output_string.rstrip(', \n')
+        output_string += "};\n\n"
+    else:
+        output_string += attr
+        output_string += (name + ' = ' + format_type(typ, arr))
+        output_string += ";\n\n"
+
+    return output_string
+
+
+def print_file(header, defines, arrays, filename):
+    """
+    Writes defines and arrays to a file.
+
+    :param header: Header of the printed file
+    :param defines: A dict mapping define_name to define_value.
+    :param arrays: A list of (array_values, array_type, array_name) tuples.
+    :param filename: The output file to write to.
+    """
+
+    # Initialize the output string
+    output_string = header
+
+    # Write the defines
+    for def_key, def_value in defines.items():
+        output_string += "#define {} ({})\n".format(def_key, def_value)
+    output_string += "\n"  # Add space between defines and arrays
+
+    # Write the arrays using print_array
+    for array_values, array_type, array_name in arrays:
+        output_string += print_array(array_values, array_type, array_name)
+
+    # Write everything to the file
+    with open(filename, "w") as file:
+        file.write(output_string)
+
+    print("Generate {}".format(filename))
+
+
+def get_type(type_string):
+    """
+    Gets the numpy type from the type specified in the hjson file.
+    :param type_string: one of int8, int16, int32, float32, float16.
+    """
+    if type_string == "int8":
+        return numpy.int8
+    elif type_string == "int16":
+        return numpy.int16
+    elif type_string == "int32":
+        return numpy.int32
+    elif type_string == "float32":
+        return numpy.float32
+    elif type_string == "float16":
+        return numpy.float16
+    else:
+        raise Exception("Input type is not valid")
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description='Generate data.h header files.')
+    parser.add_argument('--app_name', type=str, help='Name of the app')
+    parser.add_argument('--params', type=str, help='Path to the hjson file')
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+    app_name = args.app_name
+    with open(args.params, 'r') as hjson_file:
+        config_data = hjson.load(hjson_file)
+    data_args = config_data.get(app_name)
+    # Entries are strings holding Python tuple literals, e.g. ("N", 4)
+    if data_args is not None:
+        my_type = get_type(data_args.get("type"))
+        defnes = dict([ast.literal_eval(defne)
+                      for defne in data_args.get("defines")])
+        arrays = [ast.literal_eval(array) for array in data_args.get("arrays")]
+
+    # Determine output file name
+    filename = os.path.dirname(os.path.abspath(__file__))
+    filename = os.path.join(filename, "data_{}.h".format(app_name))
+
+    # Define function mappings for each app_name
+    function_map = {
+        "axpy_i32": {"func": datalib.generate_iaxpy},
+        "cfft_radix4_q16": {"func": datalib.generate_cfft_q16},
+        "cfft_radix2_q16": {"func": datalib.generate_cfft_q16},
+        "chest_q16": {"func": datalib.generate_qchest},
+        "cholesky_q32": {"func": datalib.generate_qcholesky},
+        "dotp_i32": {"func": datalib.generate_idotp},
+        "matmul_f16": {"func": datalib.generate_fmatmul},
+        "matmul_f32": {"func": datalib.generate_fmatmul},
+        "matmul_i32": {"func": datalib.generate_imatmul},
+        "matmul_i16": {"func": datalib.generate_imatmul},
+        "matmul_i8": {"func": datalib.generate_imatmul},
+        "fence": {"func": datalib.generate_iarray},
+        "memcpy": {"func": datalib.generate_iarray},
+    }
+
+    # Check if app_name exists in the function map
+    if app_name in function_map:
+        if data_args is None:
+            raise ValueError("'{}' has no entry in --params".format(app_name))
+        func = function_map[app_name]["func"]
+        # The defnes dictionary is a function argument in case the generate
+        # function adds new definitions.
+        result, defnes = func(defines=defnes, my_type=my_type)
+        # Print result to data header
+        if len(arrays) == 1:
+            arrays = [(result, *arrays[0])]
+        else:
+            arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    else:
+        print("Data generation is not defined.")
diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson
new file mode 100644
index 000000000..3a1de010e
--- /dev/null
+++ b/software/data/gendata_params.hjson
@@ -0,0 +1,177 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Parameters used by gendata_header.py to generate the data_<app>.h files.
+// Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+{
+  "axpy_i32": {
+    "type": "int32",
+    "defines": [
+      ("ALPHA",      6)
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_Y")
+      ("int32_t", "l2_Z")
+    ]
+  },
+
+  "dotp_i32": {
+    "type": "int32",
+    "defines": [
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_Y")
+      ("int32_t", "l2_Z")
+    ]
+  },
+
+  "cfft_radix4_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_CSAMPLES", 64)
+    ]
+    "arrays": [
+      ("int16_t", "l2_pSrc")
+      ("int16_t", "l2_pRes")
+      ("int16_t", "l2_twiddleCoef_q16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "cfft_radix2_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_CSAMPLES", 256)
+    ]
+    "arrays": [
+      ("int16_t", "l2_pSrc")
+      ("int16_t", "l2_pRes")
+      ("int16_t", "l2_twiddleCoef_q16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "chest_q16": {
+    "type": "int32",
+    "defines": [
+      ("N_TX",        4)
+      ("N_RX",        4)
+      ("N_SAMPLES", 512)
+    ]
+    "arrays": [
+      ("int16_t", "l2_PilotTX")
+      ("int16_t", "l2_PilotRX")
+      ("int16_t", "l2_HEST")
+    ]
+  },
+
+  "cholesky_q32": {
+    "type": "int32",
+    "defines": [
+      ("matrix_N",    32)
+      ("FIXED_POINT", 10)
+    ]
+    "arrays": [
+      ("int32_t", "l2_A")
+      ("int32_t", "l2_L")
+      ("int32_t", "l2_y")
+    ]
+  },
+
+  "matmul_f16": {
+    "type": "float16",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("__fp16", "l2_A")
+      ("__fp16", "l2_B")
+      ("__fp16", "l2_C")
+    ]
+  },
+
+  "matmul_f32": {
+    "type": "float32",
+    "defines": [
+      ("matrix_M", 16)
+      ("matrix_N", 16)
+      ("matrix_P", 16)
+    ]
+    "arrays": [
+      ("float", "l2_A")
+      ("float", "l2_B")
+      ("float", "l2_C")
+    ]
+  }
+
+  "matmul_i32": {
+    "type": "int32",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("int32_t", "l2_A")
+      ("int32_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
+  "matmul_i16": {
+    "type": "int16",
+    "defines": [
+      ("matrix_M", 64)
+      ("matrix_N", 64)
+      ("matrix_P", 64)
+    ]
+    "arrays": [
+      ("int16_t", "l2_A")
+      ("int16_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
+  "matmul_i8": {
+    "type": "int8",
+    "defines": [
+      ("matrix_M", 64)
+      ("matrix_N", 64)
+      ("matrix_P", 64)
+    ]
+    "arrays": [
+      ("int8_t", "l2_A")
+      ("int8_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
+  "fence": {
+    "type": "int32",
+    "defines": [
+      ("array_N", 12288)
+    ]
+    "arrays": [
+      ("int32_t", "l2_data")
+    ]
+  },
+
+  "memcpy": {
+    "type": "int32",
+    "defines": [
+      ("array_N", 2048)
+    ]
+    "arrays": [
+      ("int32_t", "l2_data")
+    ]
+  },
+
+}
diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py
new file mode 100644
index 000000000..c017415bf
--- /dev/null
+++ b/software/data/gendatalib.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# Library of data generators called by gendata_header.py.
+# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+# The script generates random inputs for the C functions. The inputs are
+# propagated through a python golden model. Golden models are from the
+# numpy and scipy libraries or the qmath bit-true library.
+
+import numpy as np
+import math
+import qmath
+from scipy import signal
+
+
+def select_maxval(my_type=np.int32):
+    size = 8 * np.dtype(my_type).itemsize
+    MAX = 2**(size - 2) - 1
+    return MAX
+
+
+def irandom(size, MAX, my_type=np.int16):
+    """Generate random integers in [-MAX, MAX - 1) (upper bound exclusive).
+    size (int or tuple): Size of the array to generate.
+    MAX (int): Magnitude bound passed to np.random.randint.
+    my_type (np.dtype): Integer dtype of the result. Defaults to np.int16.
+
+    Returns:
+        np.ndarray: Array of random integers of dtype my_type.
+    """
+    return np.random.randint(-MAX, MAX - 1, size=size, dtype=my_type)
+
+
+def icrandom(size, MAX, my_type=np.int16):
+    """Generate random complex numbers with integer real and imaginary parts.
+    size (int or tuple): Size of the array to generate.
+    MAX (int): Magnitude bound; each part is drawn from [-MAX, MAX - 1).
+    my_type (np.dtype): Integer dtype used to draw each part (np.int16).
+
+    Returns:
+        np.ndarray: Complex array built as real_part + 1j * imag_part.
+    """
+    real_part = np.random.randint(-MAX, MAX - 1, size=size, dtype=my_type)
+    imag_part = np.random.randint(-MAX, MAX - 1, size=size, dtype=my_type)
+    return real_part + 1j * imag_part
+
+
+def generate_iarray(my_type=np.float32, defines={}):
+
+    # Create random array of integers
+    array_N = defines['array_N']
+    MAX = select_maxval(my_type)
+    A = irandom(MAX=MAX, size=(array_N), my_type=my_type)
+    return A, defines
+
+
+def generate_fmatmul(my_type=np.float32, defines={}):
+
+    # Create matrix
+    matrix_M = defines['matrix_M']
+    matrix_N = defines['matrix_N']
+    matrix_P = defines['matrix_P']
+    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type)
+    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type)
+    C = np.matmul(A, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
+    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type)
+
+    return [A, B, C], defines
+
+
+def generate_imatmul(my_type=np.int32, defines={}):
+    # C = A (M x N) times B (N x P); all three are returned flattened.
+    # NOTE(review): np.matmul accumulates in my_type before the int32 cast.
+    matrix_M = defines['matrix_M']
+    matrix_N = defines['matrix_N']
+    matrix_P = defines['matrix_P']
+    MAX = select_maxval(my_type)
+    A = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type)
+    B = irandom(MAX=MAX, size=(matrix_N, matrix_P), my_type=my_type)
+    C = np.matmul(A, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
+    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32)
+
+    return [A, B, C], defines
+
+
+def generate_iaxpy(my_type=np.int32, defines={}):
+
+    # Create matrix
+    ALPHA = defines['ALPHA']
+    array_N = defines['array_N']
+    MAX = select_maxval(my_type)
+    X = irandom(MAX=MAX, size=(array_N), my_type=my_type)
+    Y = irandom(MAX=MAX, size=(array_N), my_type=my_type)
+    Z = (Y + X * ALPHA).astype(my_type)
+
+    return [X, Y, Z], defines
+
+
+def generate_idotp(my_type=np.int32, defines={}):
+
+    # Create matrix
+    array_N = defines['array_N']
+    MAX = select_maxval(my_type)
+    X = irandom(MAX=MAX, size=(array_N), my_type=my_type)
+    Y = irandom(MAX=MAX, size=(array_N), my_type=my_type)
+    Z = np.array((np.dot(X, Y))).astype(my_type)
+
+    return [X, Y, Z], defines
+
+
+def generate_iconv(my_type=np.int32, defines={}):
+
+    # Create matrix
+    matrix_M = defines['matrix_M']
+    matrix_N = defines['matrix_N']
+    kernel_N = defines['kernel_N']
+    MAX = select_maxval(my_type)
+    X = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type)
+    K = irandom(MAX=MAX, size=(kernel_N, kernel_N), my_type=my_type)
+    Y = signal.convolve2d(X, K, mode="same", boundary='fill')
+
+    X = X.flatten().astype(my_type)
+    K = K.flatten().astype(my_type)
+    Y = Y.flatten().astype(my_type)
+
+    return [X, K, Y], defines
+
+
+def generate_qchest(defines={}, fixed_point=15, my_type=np.int16):
+
+    N_TX = defines['N_TX']
+    N_RX = defines['N_RX']
+    N_SAMPLES = defines['N_SAMPLES']
+
+    qvector_pilot_tx = []
+    qvector_pilot_rx = []
+    qvector_Hest = []
+    for k in range(N_SAMPLES):
+        # Create pilots
+        pilot_rx = icrandom(size=N_RX, MAX=2**7, my_type=np.int32)
+        pilot_tx = icrandom(size=N_TX, MAX=2**7, my_type=np.int32)
+        # Compute Hest
+        Hest = qmath.qchest(pilot_rx, pilot_tx, fixed_point=8)
+
+        pilot_tx = np.column_stack((pilot_tx.imag, pilot_tx.real))
+        pilot_rx = np.column_stack((pilot_rx.imag, pilot_rx.real))
+        qvector_pilot_tx.append(pilot_tx.astype(np.int16).flatten())
+        qvector_pilot_rx.append(pilot_rx.astype(np.int16).flatten())
+        qvector_Hest.append(Hest)
+
+    qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * N_TX * N_SAMPLES])
+    qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * N_RX * N_SAMPLES])
+    qvector_Hest = np.reshape(qvector_Hest, [2 * N_TX * N_RX * N_SAMPLES])
+    return [qvector_pilot_tx, qvector_pilot_rx, qvector_Hest], defines
+
+
+def generate_qcholesky(defines={}, fixed_point=15, my_type=np.int32):
+
+    matrix_N = defines['matrix_N']
+    FIXED_POINT = defines['FIXED_POINT']
+
+    A = irandom(size=(matrix_N, matrix_N), MAX=2**14, my_type=my_type)
+    y = irandom(size=matrix_N, MAX=2**14, my_type=my_type)
+    A = qmath.qmatmul(A.T, A, FIXED_POINT, my_type)
+    L = qmath.qcholesky(A, fixed_point=FIXED_POINT, mytype=my_type)
+
+    A = np.reshape(A, (matrix_N * matrix_N), order='C').astype(my_type)
+    L = np.reshape(L, (matrix_N * matrix_N), order='C').astype(my_type)
+    return [A, L, y], defines
+
+
+def generate_cfft_q16(defines={}, fixed_point=15, my_type=np.int16):
+
+    N_CSAMPLES = defines['N_CSAMPLES']
+    src = icrandom(size=N_CSAMPLES, MAX=2**fixed_point, my_type=my_type)
+    tolerance = {
+        16: 16,
+        32: 20,
+        64: 24,
+        128: 28,
+        256: 32,
+        512: 48,
+        1024: 64,
+        2048: 96,
+        4096: 128}
+    bit_shift_dict_q16 = {
+        16: 11,
+        32: 10,
+        64: 9,
+        128: 8,
+        256: 7,
+        512: 6,
+        1024: 5,
+        2048: 4,
+        4096: 3}
+
+    dst = np.fft.fft(src.astype(np.csingle) / (2**fixed_point))
+    dst = dst * 2**(bit_shift_dict_q16[N_CSAMPLES])
+
+    dst = (np.column_stack((dst.real, dst.imag))).flatten()
+    src = (np.column_stack((src.real, src.imag))).flatten()
+    dst = dst.astype(np.int16)
+    src = src.astype(np.int16)
+
+    twiddles = qmath.qtwiddleCoef(N_CSAMPLES)
+    bitrever = qmath.bitreversal(N_CSAMPLES, 2)
+
+    defines['LOG2'] = int(math.log2(N_CSAMPLES))
+    defines['N_TWIDDLES'] = 3 * N_CSAMPLES // 4
+    defines['BITREVINDEXTABLE_LENGTH'] = len(bitrever)
+    defines['TOLERANCE'] = tolerance[N_CSAMPLES]
+
+    return [src, dst, twiddles, bitrever], defines
diff --git a/software/data/qmath.py b/software/data/qmath.py
new file mode 100644
index 000000000..404d7b407
--- /dev/null
+++ b/software/data/qmath.py
@@ -0,0 +1,446 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# Fixed-point (Q-format) golden-model routines used by the data generators.
+# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+import numpy as np
+import math
+from sympy.combinatorics import Permutation
+
+
+def to_fixed_point(matrix, fixed_point=15, mytype=np.int16):
+    """Convert a complex matrix to a fixed-point matrix.
+    matrix (np.ndarray): Input complex matrix.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+    Raises ValueError if a scaled part exceeds 2**(fixed_point - 1).
+    Returns:
+        tuple: Real and imaginary parts of the fixed-point matrix.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    limit = 2**(fixed_point - 1)
+    real_part = np.round(matrix.real * SCALE_FACTOR)
+    imag_part = np.round(matrix.imag * SCALE_FACTOR)
+    # Check before the cast, where wrap-around would hide an overflow.
+    if np.any(np.abs(real_part) > limit) or np.any(np.abs(imag_part) > limit):
+        raise ValueError("Overflow")
+    return real_part.astype(mytype), imag_part.astype(mytype)
+
+
+def from_fixed_point(real_part, imag_part, fixed_point=15, mytype=np.int16):
+    """Convert a fixed-point matrix back to a floating-point complex matrix.
+    real_part (np.ndarray): Real part of the fixed-point matrix.
+    imag_part (np.ndarray): Imaginary part of the fixed-point matrix.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        np.ndarray: Reconstructed complex matrix.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    return (real_part / SCALE_FACTOR) + 1j * (imag_part / SCALE_FACTOR)
+
+
+def qmatmul(A, B, fixed_point=15, mytype=np.int16):
+    """Perform fixed-point matrix multiplication.
+    A (np.ndarray): First matrix.
+    B (np.ndarray): Second matrix.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        np.ndarray: Fixed-point result of the matrix multiplication.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    rows_A, cols_A = A.shape
+    cols_B = B.shape[1]
+    C = np.zeros((rows_A, cols_B), dtype=mytype)
+
+    for i in range(rows_A):
+        for j in range(cols_B):
+            for k in range(cols_A):
+                C[i, j] += A[i, k] * B[k, j] // SCALE_FACTOR
+    return C
+
+
+def qcmatmul(A_real, A_imag, B_real, B_imag, fixed_point=15, mytype=np.int16):
+    """Perform fixed-point complex matrix multiplication.
+    A_real (np.ndarray): Real part of the first matrix.
+    A_imag (np.ndarray): Imaginary part of the first matrix.
+    B_real (np.ndarray): Real part of the second matrix.
+    B_imag (np.ndarray): Imaginary part of the second matrix.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        tuple: Real and imaginary parts of the result matrix.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    rows_A, cols_A = A_real.shape
+    cols_B = B_real.shape[1]
+
+    C_real = np.zeros((rows_A, cols_B), dtype=mytype)
+    C_imag = np.zeros((rows_A, cols_B), dtype=mytype)
+
+    for i in range(rows_A):
+        for j in range(cols_B):
+            for k in range(cols_A):
+                real_product = A_real[i, k] * \
+                    B_real[k, j] - A_imag[i, k] * B_imag[k, j]
+                imag_product = A_real[i, k] * \
+                    B_imag[k, j] + A_imag[i, k] * B_real[k, j]
+
+                C_real[i, j] += real_product // SCALE_FACTOR
+                C_imag[i, j] += imag_product // SCALE_FACTOR
+
+    return C_real, C_imag
+
+
+def qcmvmul(A_real, A_imag, B_real, B_imag, fixed_point=15, mytype=np.int16):
+    """Perform fixed-point complex matrix-vector multiplication.
+    A_real (np.ndarray): Real part of the matrix.
+    A_imag (np.ndarray): Imaginary part of the matrix.
+    B_real (np.ndarray): Real part of the vector.
+    B_imag (np.ndarray): Imaginary part of the vector.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        tuple: Real and imaginary parts of the result vector.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    rows_A, cols_A = A_real.shape
+
+    C_real = np.zeros(rows_A, dtype=mytype)
+    C_imag = np.zeros(rows_A, dtype=mytype)
+
+    for i in range(rows_A):
+        for k in range(cols_A):
+            real_product = A_real[i, k] * B_real[k] - A_imag[i, k] * B_imag[k]
+            imag_product = A_real[i, k] * B_imag[k] + A_imag[i, k] * B_real[k]
+
+            C_real[i] += real_product // SCALE_FACTOR
+            C_imag[i] += imag_product // SCALE_FACTOR
+
+    return C_real, C_imag
+
+
+def qsqrt(n, fixed_point=15, mytype=np.int16):
+    """Compute the square root of a number in fixed-point representation using
+    Newton-Raphson method.
+    n (np.ndarray): Input value(s) in fixed-point representation.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        np.ndarray: Square root of the input in fixed-point representation.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    x = np.ones_like(n, dtype=mytype) * SCALE_FACTOR
+    n_one = n * SCALE_FACTOR
+
+    itr = 0
+    while True:
+        x_old = x
+        x = (x + n_one // x) // 2
+        if np.array_equal(
+                x, x_old) or itr == 10:  # Convergence or max iterations
+            break
+        itr += 1
+    return x
+
+
+def qcholesky(A, fixed_point=15, mytype=np.int16):
+    """Perform fixed-point Cholesky decomposition of a symmetric
+    positive-definite matrix.
+    A (np.ndarray): Input matrix (must be square and symmetric).
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        np.ndarray: Lower triangular factor L (same shape as A, dtype mytype);
+        entries above the diagonal are zero.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    rows, columns = A.shape
+    if rows != columns:
+        raise ValueError("Matrix must be square for Cholesky decomposition.")
+
+    L = np.zeros((rows, columns), dtype=mytype)
+
+    for row in range(rows):
+        for column in range(columns):
+            if row == column:
+                pivot = A[row, column]
+                for k in range(column):
+                    Ljk = L[row, k]
+                    pivot -= (Ljk**2) // SCALE_FACTOR
+                if pivot < 0:
+                    # A negative pivot is clamped to zero rather than raising
+                    # an error.
+                    pivot = 0
+                L[row, column] = qsqrt(pivot, fixed_point, mytype)
+            elif row > column:
+                pivot = A[row, column]
+                for k in range(column):
+                    Lik = L[row, k]
+                    Ljk = L[column, k]
+                    pivot -= (Lik * Ljk) // SCALE_FACTOR
+                diag = L[column, column]
+                L[row, column] = (pivot * SCALE_FACTOR) // diag
+            else:
+                L[row, column] = 0
+
+    return L
+
+
+def qccholesky(M_real, M_imag, fixed_point=15, mytype=np.int16):
+    """Perform fixed-point Cholesky decomposition of a complex matrix given
+    as separate real and imaginary parts.
+    M_real, M_imag (np.ndarray): Real/imaginary parts (must be square).
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+        tuple: (L_real, L_imag), real and imaginary parts of the lower
+        triangular factor.
+    """
+
+    SCALE_FACTOR = 2**fixed_point
+    NEGATIVE = fixed_point**2 + 1
+
+    rows, columns = M_real.shape
+    L_real = np.zeros_like(M_real, dtype=mytype)  # Initialize dest with zeros
+    L_imag = np.zeros_like(M_imag, dtype=mytype)  # Initialize dest with zeros
+
+    # Check for dimensional errors
+    if rows != columns:
+        raise ValueError("Matrix must be square for Cholesky decomposition.")
+
+    for row in range(rows):
+        for column in range(columns):
+
+            if row == column:
+                # Diagonal element
+                real_pivot = M_real[row, column]
+                for k in range(column):
+                    real_Ljk = L_real[row, k]
+                    imag_Ljk = L_imag[row, k]
+                    product = (real_Ljk**2 + imag_Ljk**2) // SCALE_FACTOR
+                    real_pivot = real_pivot - product
+
+                # NOTE(review): NEGATIVE > 0, so any negative pivot raises here
+                if real_pivot < 0:
+                    if real_pivot < NEGATIVE:
+                        raise ValueError("Negative value encountered.")
+                    real_pivot = 0
+                L_real[row, column] = qsqrt(real_pivot, fixed_point, mytype)
+
+            elif row > column:
+                # Off-diagonal element (below the diagonal)
+                real_pivot = M_real[row, column]
+                imag_pivot = M_imag[row, column]
+
+                for k in range(column):
+                    real_Lik = L_real[row, k]
+                    imag_Lik = L_imag[row, k]
+                    real_Ljk = L_real[column, k]
+                    imag_Ljk = L_imag[column, k]
+                    real_product = (real_Lik * real_Ljk - imag_Lik * imag_Ljk)
+                    imag_product = (real_Lik * imag_Ljk + imag_Lik * real_Ljk)
+                    real_product = real_product // SCALE_FACTOR
+                    imag_product = imag_product // SCALE_FACTOR
+                    real_pivot = real_pivot - real_product
+                    imag_pivot = imag_pivot - imag_product
+
+                diag = L_real[column, column]
+                L_real[row, column] = (real_pivot * SCALE_FACTOR) // diag
+                L_imag[row, column] = (imag_pivot * SCALE_FACTOR) // diag
+
+            else:
+                # Above diagonal, set to zero
+                L_real[row, column] = 0
+                L_imag[row, column] = 0
+
+    return L_real, L_imag
+
+
+def qinvertLt(M_real, M_imag, y_real, y_imag, fixed_point=15, mytype=np.int16):
+    """Invert a lower triangular complex matrix using fixed-point arithmetic.
+    M_real (np.ndarray): Real part of the lower triangular matrix.
+    M_imag (np.ndarray): Imaginary part of the lower triangular matrix.
+    y_real (np.ndarray): Real part of the vector.
+    y_imag (np.ndarray): Imaginary part of the vector.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+    tuple: Real and imaginary parts of the result vector.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    n = M_real.shape[0]
+    x_real = np.zeros_like(y_real, dtype=mytype)
+    x_imag = np.zeros_like(y_imag, dtype=mytype)
+
+    for i in range(n):
+        sum_real = y_real[i]
+        sum_imag = y_imag[i]
+        for j in range(i):
+            sum_real -= (M_real[i, j] * x_real[j] -
+                         M_imag[i, j] * x_imag[j]) // SCALE_FACTOR
+            sum_imag -= (M_real[i, j] * x_imag[j] +
+                         M_imag[i, j] * x_real[j]) // SCALE_FACTOR
+
+        x_real[i] = (sum_real * SCALE_FACTOR) // M_real[i, i]
+        x_imag[i] = (sum_imag * SCALE_FACTOR) // M_real[i, i]
+
+    return x_real, x_imag
+
+
+def qinvertUt(M_real, M_imag, y_real, y_imag, fixed_point=15, mytype=np.int16):
+    """Invert an upper triangular complex matrix using fixed-point arithmetic.
+    M_real (np.ndarray): Real part of the upper triangular matrix.
+    M_imag (np.ndarray): Imaginary part of the upper triangular matrix.
+    y_real (np.ndarray): Real part of the vector.
+    y_imag (np.ndarray): Imaginary part of the vector.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+    tuple: Real and imaginary parts of the result vector.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    n = M_real.shape[0]
+    x_real = np.zeros_like(y_real, dtype=mytype)
+    x_imag = np.zeros_like(y_imag, dtype=mytype)
+
+    for i in range(n - 1, -1, -1):
+        sum_real = y_real[i]
+        sum_imag = y_imag[i]
+
+        for j in range(i + 1, n):
+            sum_real -= (M_real[i, j] * x_real[j] -
+                         M_imag[i, j] * x_imag[j]) // SCALE_FACTOR
+            sum_imag -= (M_real[i, j] * x_imag[j] +
+                         M_imag[i, j] * x_real[j]) // SCALE_FACTOR
+
+        x_real[i] = (sum_real * SCALE_FACTOR) // M_real[i, i]
+        x_imag[i] = (sum_imag * SCALE_FACTOR) // M_real[i, i]
+
+    return x_real, x_imag
+
+
+def qtwiddleCoef(N, fixed_point=15, mytype=np.int16):
+    """Generate fixed-point twiddle coefficients for FFT.
+    N (int): Number of points in FFT.
+    fixed_point (int): Number of bits for the fractional part.
+    mytype (np.dtype): Data type for the fixed-point representation.
+
+    Returns:
+    np.ndarray: Twiddle coefficients in fixed-point representation.
+    """
+    PI = 3.14159265358979
+    twiddleCoefq15 = np.zeros((int(2 * 3 * N / 4)), dtype=mytype)
+    for i in range(int(3 * N / 4)):
+        twiddleCoefq15_cos = math.cos(i * 2 * PI / N)
+        twiddleCoefq15_sin = math.sin(i * 2 * PI / N)
+        twiddleCoefq15[2 * i] = \
+            int(round(twiddleCoefq15_cos * (2**fixed_point - 1)))
+        twiddleCoefq15[2 * i + 1] = \
+            int(round(twiddleCoefq15_sin * (2**fixed_point - 1)))
+    return twiddleCoefq15
+
+
+def bitreversal(N, R):
+    """Perform bit-reversal for FFT with radix-R decomposition.
+
+    Args:
+        N (int): Number of points in FFT.
+        R (int): Radix for FFT decomposition.
+
+    Returns:
+        np.ndarray: Flattened bit-reversal transposition table.
+    """
+    # Decompose
+    logR2 = []
+    idx = N
+    while (idx >= R):
+        logR2.append(int(math.log2(R)))
+        idx = idx // R
+    if (idx > 1):
+        logR2.append(int(math.log2(idx)))
+    # Bitreversal
+    indexes = []
+    for x in range(N):
+        result = 0
+        for bits in logR2:
+            mask = (0xffffffff >> (32 - bits))
+            result = (result << bits) | (x & mask)
+            x = x >> bits
+        indexes.append(result)
+    # Create transpositions table
+    tps = []
+    for c in Permutation.from_sequence(indexes).cyclic_form:
+        for i in range(len(c) - 1):
+            tps.append([c[i] * 8, c[-1] * 8])
+    return np.ndarray.flatten(np.array(tps))
+
+
+def q_sat(x):
+    # Shift x by one 2**16 step when it lies outside the signed 16-bit range.
+    if x > 2**15 - 1:
+        return x - 2**16
+    if x < -2**15:
+        return x + 2**16
+    return x
+
+
+def qchest(in_rx, in_tx, division=False, fixed_point=8, mytype=np.int16):
+    """Perform fixed-point complex channel estimation (CHEST).
+    in_rx (np.ndarray): Received signal array (complex numbers).
+    in_tx (np.ndarray): Transmitted signal array (complex numbers).
+    division (bool): Whether to perform division or multiplication.
+    Defaults to False.
+    fixed_point (int): Number of bits for the fractional part. Defaults to 8.
+    mytype (np.dtype): Data type for fixed-point representation.
+    Defaults to np.int16.
+
+    Returns:
+        np.ndarray: Resulting array in fixed-point representation.
+    """
+    SCALE_FACTOR = 2**fixed_point
+    n_rx = in_rx.size
+    n_tx = in_tx.size
+
+    # Resulting array (real and imaginary interleaved)
+    result = np.zeros(2 * (n_tx * n_rx), dtype=mytype)
+
+    for i in range(n_rx):
+        a_r = in_rx[i].real
+        a_i = in_rx[i].imag
+        for j in range(n_tx):
+            b_r = in_tx[j].real
+            b_i = in_tx[j].imag
+
+            if division:
+                # Compute data division
+                den = (2**16) // (b_r * b_r + b_i * b_i)
+                if den == 0:
+                    raise ZeroDivisionError(
+                        "Division by zero encountered in CHEST.")
+                num_r = (a_r * b_r + a_i * b_i)
+                num_i = (a_i * b_r - a_r * b_i)
+                result[2 * (i * n_tx + j)] = (num_r // den) * SCALE_FACTOR
+                result[2 * (i * n_tx + j) + 1] = (num_i // den) * SCALE_FACTOR
+            else:
+                # Compute data multiplication
+                num_r = (a_r * b_r - a_i * b_i)
+                num_i = (a_i * b_r + a_r * b_i)
+                result[2 * (i * n_tx + j)] = q_sat(num_r // SCALE_FACTOR)
+                result[2 * (i * n_tx + j) + 1] = q_sat(num_i // SCALE_FACTOR)
+
+    return result
diff --git a/software/kernels/baremetal/mempool_checks.h b/software/kernels/baremetal/mempool_checks.h
index d680764c1..110acec90 100644
--- a/software/kernels/baremetal/mempool_checks.h
+++ b/software/kernels/baremetal/mempool_checks.h
@@ -12,7 +12,7 @@
   @param[in]     TOL  floating point tolerance
   @return        none
 */
-void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp,
+void mempool_check_i32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp,
                        uint32_t NEL, int32_t TOL, bool verbose) {
   uint32_t core_id = mempool_get_core_id();
   int32_t error;
@@ -41,7 +41,7 @@ void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp,
   @param[in]     TOL  floating point tolerance
   @return        none
 */
-void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp,
+void mempool_check_i16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp,
                        uint32_t NEL, int16_t TOL, bool verbose) {
   uint32_t core_id = mempool_get_core_id();
   int16_t error;
@@ -53,7 +53,36 @@ void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp,
       error = (int16_t)(exp - res);
       bool print = ((error > TOL) || (error < (-TOL))) | verbose;
       if (print) {
-        printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res);
+        printf("CHECK(%d): EXP = %04X - RESP = %04X\n", i, exp, res);
+        ERRORS++;
+      }
+    }
+    printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL);
+  }
+  return;
+}
+
+/**
+  @brief         Check for i8 kernels. Only core 0 runs the comparison.
+  @param[in]     pRes points to the result
+  @param[in]     pExp points to the expected result
+  @param[in]     NEL  number of elements to check
+  @param[in]     TOL  largest absolute difference accepted (integer)
+  @param[in]     verbose if set, every element is reported regardless of TOL
+  @return        none
+*/
+void mempool_check_i8(int8_t *__restrict__ pRes, int8_t *__restrict__ pExp,
+                      uint32_t NEL, int16_t TOL, bool verbose) {
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    uint32_t ERRORS = 0;
+    for (uint32_t i = 0; i < NEL; i++) {
+      int16_t exp = (int8_t)pExp[i];
+      int16_t res = (int8_t)pRes[i];
+      int16_t error = (int16_t)(exp - res); // up to +/-255: wraps in int8_t
+      if ((error > TOL) || (error < (-TOL)) || verbose) {
+        printf("CHECK(%d): EXP = %02X - RESP = %02X\n", i, (uint8_t)exp,
+               (uint8_t)res);
         ERRORS++;
       }
     }
diff --git a/software/kernels/baremetal/mempool_dotp_i32p.h b/software/kernels/baremetal/mempool_dotp_i32p.h
new file mode 100644
index 000000000..26fbe03e9
--- /dev/null
+++ b/software/kernels/baremetal/mempool_dotp_i32p.h
@@ -0,0 +1,196 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* Parallel dot-product: each core reduces one contiguous chunk of Len / nPE
+ * elements, adds its partial sum to s[0] atomically, then joins a barrier. */
+void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
+               uint32_t nPE) {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const chunk = Len / nPE;
+  uint32_t const first = core_id * chunk;
+  uint32_t const last = first + chunk;
+  register int32_t acc = 0;
+  // Accumulate the products of this core's slice [first, last)
+  for (uint32_t k = first; k < last; k++) {
+    acc += in_a[k] * in_b[k];
+  }
+  // Publish the partial result into the shared accumulator
+  __atomic_fetch_add(s, acc, __ATOMIC_RELAXED);
+#ifdef LOG_BARRIERS
+  mempool_log_barrier(2, core_id);
+#else
+  mempool_barrier(mempool_get_core_count());
+#endif
+  return;
+}
+
+/* Parallel dot-product with loop unrolling (4 elements per iteration) */
+void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
+                         uint32_t nPE) {
+  // Each core reduces the contiguous chunk [core_id * step, end) of the inputs
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t reminder = step % 4;
+  uint32_t const end = core_id * step + step;
+  uint32_t i;
+  register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
+  register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0;
+  register int32_t local_sum0 = 0;
+  register int32_t local_sum1 = 0;
+  register int32_t local_sum2 = 0;
+  register int32_t local_sum3 = 0;
+  for (i = core_id * step; i < end - reminder; i += 4) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    a1 = in_a[i + 1];
+    b1 = in_b[i + 1];
+    a2 = in_a[i + 2];
+    b2 = in_b[i + 2];
+    a3 = in_a[i + 3];
+    b3 = in_b[i + 3];
+    local_sum0 += a0 * b0;
+    local_sum1 += a1 * b1;
+    local_sum2 += a2 * b2;
+    local_sum3 += a3 * b3;
+  }
+  i = end - reminder;
+  while (i < end) { // bound is the chunk end, not the chunk length
+    a0 = in_a[i];
+    b0 = in_b[i];
+    local_sum0 += a0 * b0;
+    i++;
+  }
+  local_sum0 += local_sum1;
+  local_sum2 += local_sum3;
+  local_sum0 += local_sum2;
+  __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
+#ifdef LOG_BARRIERS
+  mempool_log_barrier(2, core_id);
+#else
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier(num_cores);
+#endif
+  return;
+}
+
+/* Binary tree reduction of the partial sums in sum[], pairing cores level by
+ * level. Every core ends in mempool_wfi(); one core calls wake_up_all(). */
+void mempool_binary_reduction(int32_t *sum, uint32_t core_id,
+                              uint32_t num_cores) {
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    // idx: sum[] slot of the first core in this core's group of `step` cores
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+      // Old flag value was non-zero: the other half already arrived
+      // Reduction: add the partial sum held by the second half of the group
+      sum[idx] += sum[idx + previous_step * BANKING_FACTOR];
+
+      // Reset the flag and move to the next level of the binary tree
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // First to arrive at this level: stop here and go to sleep
+      break;
+    }
+  }
+
+  // Only the core that completed every level reaches num_cores == 1
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling. Load and stores are intended to
+ * hit only local memory: each core reads 4 consecutive elements starting at
+ * 4 * core_id and strides by NUM_BANKS. Reduction is selected by macros. */
+#define NUM_CORES_RED (16) // cores per partial sum in SINGLE_CORE_REDUCTION
+void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
+                               uint32_t Len) {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const remainder = Len % 4;
+  uint32_t const idx_stop = Len - remainder;
+
+  register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
+  register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0;
+  register int32_t local_sum0 = 0;
+  register int32_t local_sum1 = 0;
+  register int32_t local_sum2 = 0;
+  register int32_t local_sum3 = 0;
+
+  for (uint32_t i = core_id * 4; i < idx_stop; i += NUM_BANKS) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    a1 = in_a[i + 1];
+    b1 = in_b[i + 1];
+    a2 = in_a[i + 2];
+    b2 = in_b[i + 2];
+    a3 = in_a[i + 3];
+    b3 = in_b[i + 3];
+    local_sum0 += a0 * b0;
+    local_sum1 += a1 * b1;
+    local_sum2 += a2 * b2;
+    local_sum3 += a3 * b3;
+  }
+  // The Len % 4 leftover elements are accumulated by a single core
+  if (core_id == ((Len % NUM_BANKS) / 4)) {
+    for (uint32_t i = Len - remainder; i < Len; i++) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      local_sum0 += a0 * b0;
+    }
+  }
+  local_sum0 += local_sum1;
+  local_sum2 += local_sum3;
+  local_sum0 += local_sum2;
+// Reduction variant; if none of the macros below is defined nothing is written
+// A) Cores atomically fetch and add in sum variable
+// B) A global barrier synchronizes all of them
+#if defined(ATOMIC_REDUCTION)
+  __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
+  mempool_log_barrier(2, core_id);
+
+// A) Groups of NUM_CORES_RED cores atomically fetch and add in sum array
+// B) The last core to the reduction barrier sums the partial reductions
+#elif defined(SINGLE_CORE_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  __atomic_fetch_add(
+      &s[BANKING_FACTOR * NUM_CORES_RED * (core_id / NUM_CORES_RED)],
+      local_sum0, __ATOMIC_RELAXED);
+  if ((num_cores - 1) ==
+      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
+    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
+    __sync_synchronize(); // Full memory barrier
+    uint32_t idx_red = 0;
+    local_sum0 = 0;
+    while (idx_red < NUM_BANKS) {
+      local_sum0 += s[idx_red];
+      idx_red += BANKING_FACTOR * NUM_CORES_RED;
+    }
+    s[0] = local_sum0;
+    wake_up_all();
+  }
+  mempool_wfi();
+
+// A) Cores store locally in sum array
+// B) Partial sums are reduced logarithmically
+#elif defined(BINARY_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  // Slot stride must match the indexing used by mempool_binary_reduction
+  s[core_id * BANKING_FACTOR] = local_sum0;
+  mempool_binary_reduction(s, core_id, num_cores);
+#endif
+  return;
+}
diff --git a/software/apps/baremetal/dotp_i32/dotp_single.h b/software/kernels/baremetal/mempool_dotp_i32s.h
similarity index 88%
rename from software/apps/baremetal/dotp_i32/dotp_single.h
rename to software/kernels/baremetal/mempool_dotp_i32s.h
index 58797ee80..dd562debb 100644
--- a/software/apps/baremetal/dotp_i32/dotp_single.h
+++ b/software/kernels/baremetal/mempool_dotp_i32s.h
@@ -5,12 +5,11 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 /* Single-core dot-product */
-void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
+void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   if (core_id == 0) {
-
     mempool_start_benchmark();
     // Kernel execution
     register int32_t local_sum = 0;
@@ -18,7 +17,6 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
     do {
       local_sum += ((*in_a++) * (*in_b++));
     } while (in_a < end);
-
     *s = local_sum;
     mempool_stop_benchmark();
   }
@@ -26,17 +24,15 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
 }
 
 /* Single-core dot-product unrolled4 */
-void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                           uint32_t Len) {
+void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
+                         uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   if (core_id == 0) {
-
     mempool_start_benchmark();
     uint32_t reminder = Len % 4;
     uint32_t i = 0;
-
     int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, b3 = 0;
     register int32_t local_sum_1 = 0;
     register int32_t local_sum_2 = 0;
@@ -70,5 +66,4 @@ void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
-  // mempool_log_barrier(2, core_id);
 }
diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk
index 69d309158..52d86c6d1 100644
--- a/software/runtime/runtime.mk
+++ b/software/runtime/runtime.mk
@@ -172,11 +172,8 @@ OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c"))
 %.ld: %.ld.c
 	$(RISCV_CC) -P -E $(DEFINES) $< -o $@
 
-%.h: %.args
-	cat $< | xargs $(python) $(MEMPOOL_DIR)/scripts/gen_data.py --clangformat=$(LLVM_INSTALL_DIR)/bin/clang-format -o $@
-
-%.h: %.py
-	$(python) $<
+data_%.h: $(DATA_DIR)/gendata_params.hjson # the stem ($*) is the app name
+	$(python) $(DATA_DIR)/gendata_header.py --app_name $* --params $<
 
 # Bootrom
 %.elf: %.S $(ROOT_DIR)/bootrom.ld $(LINKER_SCRIPT)
diff --git a/software/tests/baremetal/Makefile b/software/tests/baremetal/Makefile
index 5efba8e1b..71dac7ce9 100644
--- a/software/tests/baremetal/Makefile
+++ b/software/tests/baremetal/Makefile
@@ -16,8 +16,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime)
 include $(RUNTIME_DIR)/runtime.mk
 
 TESTS := $(patsubst $(TESTS_DIR)/%/main.c,%,$(shell find $(TESTS_DIR) -name "main.c"))
-DATA := $(patsubst %.args,%.h,$(shell find $(TESTS_DIR) -name "data.args"))
-ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(TESTS))
 
 # Make all applications
@@ -26,7 +24,7 @@ all: $(TESTS)
 $(TESTS): % : $(BIN_DIR)/% $(TESTS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S,c,h,ld} -type f)
 
 .PHONY: $(BINARIES)
-$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes
+$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes
 	mkdir -p $(dir $@)
 	$(RISCV_CC) -Iinclude $(RISCV_LDFLAGS) -o $@ $< $(RUNTIME) -T$(RUNTIME_DIR)/link.ld
 	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump
diff --git a/software/tests/baremetal/fence/data.args b/software/tests/baremetal/fence/data.args
deleted file mode 100644
index f52fe46db..000000000
--- a/software/tests/baremetal/fence/data.args
+++ /dev/null
@@ -1 +0,0 @@
---variable=l2_data --size=12288
diff --git a/software/tests/baremetal/fence/main.c b/software/tests/baremetal/fence/main.c
index 82f493c12..934cddfea 100644
--- a/software/tests/baremetal/fence/main.c
+++ b/software/tests/baremetal/fence/main.c
@@ -7,7 +7,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "data.h"
+#include "data_fence.h"
 #include "dma.h"
 #include "encoding.h"
 #include "mempool_dma_frontend.h"
diff --git a/software/tests/baremetal/memcpy/data.args b/software/tests/baremetal/memcpy/data.args
deleted file mode 100644
index 21fbc935a..000000000
--- a/software/tests/baremetal/memcpy/data.args
+++ /dev/null
@@ -1 +0,0 @@
---variable=l2_data --size=2048
\ No newline at end of file
diff --git a/software/tests/baremetal/memcpy/main.c b/software/tests/baremetal/memcpy/main.c
index 4e07a9a30..6ca336cbf 100644
--- a/software/tests/baremetal/memcpy/main.c
+++ b/software/tests/baremetal/memcpy/main.c
@@ -7,7 +7,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "data.h"
+#include "data_memcpy.h"
 #include "dma.h"
 #include "encoding.h"
 #include "mempool_dma_frontend.h"