From dea872f6fb96294d2d402046a88a6dbb38c97b56 Mon Sep 17 00:00:00 2001
From: mbertuletti
Date: Thu, 19 Dec 2024 11:52:17 +0100
Subject: [PATCH] [software] Add explanation for the use of defines

---
 software/apps/baremetal/Makefile              |  19 +--
 software/apps/baremetal/axpy_f16/main.c       |   1 -
 software/apps/baremetal/axpy_f32/main.c       |   1 -
 .../apps/baremetal/cfft_radix2_q16/main.c     |   1 -
 .../apps/baremetal/cfft_radix4_f16/main.c     |  59 ++++----
 .../apps/baremetal/cfft_radix4_q16/main.c     |  59 ++++----
 software/apps/baremetal/chest_f16/main.c      |   8 ++
 software/apps/baremetal/chest_q16/main.c      |   8 ++
 software/apps/baremetal/cholesky_f16/main.c   |   9 ++
 software/apps/baremetal/cholesky_q32/main.c   |  27 ++--
 software/apps/baremetal/cmatmul_f16/main.c    |  15 +-
 software/apps/baremetal/cmatmul_q16/main.c    |   8 ++
 software/apps/baremetal/dotp_f16/main.c       |  15 --
 software/apps/baremetal/dotp_f32/main.c       |  13 --
 software/apps/baremetal/dotp_i32/main.c       |  15 --
 software/apps/baremetal/matmul_f16/main.c     |   8 +-
 software/apps/baremetal/matmul_f32/main.c     |   9 +-
 software/apps/baremetal/mimo_mmse_f16/main.c  |  46 +++++--
 software/apps/baremetal/mimo_mmse_f32/main.c  |  11 ++
 software/apps/baremetal/mimo_mmse_f8/main.c   |  13 +-
 software/apps/baremetal/mimo_mmse_q16/main.c  |   8 +-
 software/apps/baremetal/ofdm_f16/main.c       |  29 ++--
 .../kernels/baremetal/mempool_chest_q16.h     |   2 +-
 .../kernels/baremetal/mempool_cholesky_f16s.h |   7 +-
 .../kernels/baremetal/mempool_cholesky_f32s.h |   5 +-
 .../kernels/baremetal/mempool_cholesky_q16s.h |   1 -
 .../kernels/baremetal/mempool_cholesky_q32p.h |  93 +++++++------
 .../kernels/baremetal/mempool_cholesky_q32s.h |   2 +-
 .../kernels/baremetal/mempool_cmatmul_f16.h   |   8 +-
 .../kernels/baremetal/mempool_cmatmul_q16.h   |   3 +-
 software/kernels/baremetal/mempool_dotp_f16.h |  10 ++
 software/kernels/baremetal/mempool_dotp_f32.h |  10 ++
 software/kernels/baremetal/mempool_dotp_i32.h |  12 ++
 .../baremetal/mempool_linearsolver_f16s.h     |   6 +-
 .../baremetal/mempool_linearsolver_f32s.h     |   4 +-
 .../baremetal/mempool_linearsolver_q32p.h     | 128 +++++++++---------
 .../baremetal/mempool_linearsolver_q32s.h     |  10 +-
 .../kernels/baremetal/mempool_matmul_f32.h    |   2 +
 .../baremetal/mempool_mimo_mmse_f16s.h        |   9 +-
 .../baremetal/mempool_mimo_mmse_f32p.h        |   3 +-
 .../mempool_radix4_cfft_butterfly_f16.h       |  16 +--
 .../mempool_radix4_cfft_butterfly_q16.h       |  16 +--
 .../baremetal/mempool_radix4_cfft_f16p.h      |  33 +++--
 .../baremetal/mempool_radix4_cfft_q16p.h      |  50 +++----
 software/runtime/runtime.mk                   |   3 +-
 45 files changed, 452 insertions(+), 363 deletions(-)

diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index 9511f7869..bb640dfde 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-FP_APPS := axpy_f16 axpy_f32
-FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
-FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
-FP_APPS += dotp_f16 dotp_f32
-FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16
-
-I_APPS := synth_i32
-I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
-I_APPS += cmatmul_q16 mimo_mmse_q16
-
-ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
-ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
+FP_SUFFIXES := f16 f32 f8
+I_SUFFIXES := q16 q32 i16 i32 i8
+FP_APPS := $(foreach suf, $(FP_SUFFIXES), $(filter %_$(suf), $(ALL)))
+I_APPS := $(foreach suf, $(I_SUFFIXES), $(filter %_$(suf), $(ALL)))
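+# For illustration (app names taken from the removed lists above): with
+# APPS = axpy_f32 cfft_radix2_q16 synth_i32, the suffix filters yield
+# FP_APPS = axpy_f32 and I_APPS = cfft_radix2_q16 synth_i32.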
+# Filter out applications not supported by each compiler
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
index 1795e9059..8bcb38296 100644
--- a/software/apps/baremetal/axpy_f16/main.c
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
index 34ead109c..cb3f1d8a9 100644
--- a/software/apps/baremetal/axpy_f32/main.c
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c
index e23fb929e..25510b184 100644
--- a/software/apps/baremetal/cfft_radix2_q16/main.c
+++ b/software/apps/baremetal/cfft_radix2_q16/main.c
@@ -19,7 +19,6 @@
 #include "synchronization.h"
 
 #include "data_cfft_radix2_q16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /* CFFT mempool libraries */
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index b06ae3189..518c06add 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -19,25 +19,30 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-#define PARALLEL // Parallel FFT not "memory-aware".
-// #define FOLDED // Parallel FFT with "memory-aware" load/store.
-//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED is defined, the number of independent FFTs run
+sequentially by each core, on different rows of the array of memory banks.
+N_FFTs_COL: When SCHEDULED is defined, the number of independent FFTs run in
+parallel, on different columns of the array of memory banks.
+
+BITREVERSETABLE: When defined, bit-reversal indices are fetched from a table,
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: When FOLDED is defined, it can additionally be defined to
+also fold the twiddle factors in memory.
+*/
+
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Also the twiddles have "memory-aware" load/stores.
-// #define FOLDED_TWIDDLES
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 1
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 1 +#define N_FFTs_ROW (1) +#define N_FFTs_COL (1) #if (N_FFTs_COL > MAX_COL) -#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)] +#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)] #endif #include "baremetal/mempool_cfft_q16_bitreversal.h" @@ -59,16 +64,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] #endif #if (defined(SCHEDULED) || defined(FOLDED)) -__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); #endif int main() { @@ -96,7 +101,7 @@ int main() { if (core_id == 0) { for (uint32_t j = 0; j < N_FFTs_ROW; j++) { for (uint32_t i = 0; i < N_FFTs_COL; i++) { - dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS), + dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS), l2_pSrc, N_CSAMPLES * sizeof(int32_t)); } } @@ -113,9 +118,11 @@ int main() { for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) { *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] = *(v2h *)&l2_twiddleCoef_f16[2 * i]; - *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] = + *(v2h *)&l1_twiddleCoef_f16_src[2 * + (i + j * N_WORDS_COL + 1 * NUM_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)]; - *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] = + *(v2h *)&l1_twiddleCoef_f16_src[2 * + (i + j * N_WORDS_COL + 2 * NUM_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)]; } } diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c index 08ed80e9b..bebb66059 100644 --- a/software/apps/baremetal/cfft_radix4_q16/main.c +++ b/software/apps/baremetal/cfft_radix4_q16/main.c @@ -19,23 +19,30 @@ /* CFFT data libraries */ #include "data_cfft_radix4_q16.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) +#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4)) -/* CHOOSE ONE */ -//#define SINGLE // Single core FFT. -//#define PARALLEL // Parallel FFT not "memory-aware". -//#define FOLDED // Parallel FFT with "memory-aware" load/store. -#define SCHEDULED // Folded FFTs arranged in rows and cols.''' +/* +====================== +Parameters and defines + +PARALLEL: When defined runs parallel FFT. +FOLDED: When defined runs parallel FFT with folded inputs in memory. +SCHEDULED: When defined runs multiple parallel folded-inputs FFTs. 
+N_FFTs_ROW: When SCHEDULED is defined, the number of independent FFTs run
+sequentially by each core, on different rows of the array of memory banks.
+N_FFTs_COL: When SCHEDULED is defined, the number of independent FFTs run in
+parallel, on different columns of the array of memory banks.
+
+BITREVERSETABLE: When defined, bit-reversal indices are fetched from a table,
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: When FOLDED is defined, it can additionally be defined to
+also fold the twiddle factors in memory.
+*/
 
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
-#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
 #endif
 // Also the twiddles have "memory-aware" load/stores.
 #define FOLDED_TWIDDLES
@@ -60,16 +67,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
 #endif
 
 #if (defined(SCHEDULED) || defined(FOLDED))
-int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -97,7 +104,7 @@ int main() {
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
@@ -112,9 +119,11 @@ int main() {
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
       *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * i];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 2U)];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 3U)];
     }
   }
diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
index e0feb90c7..304313788 100644
--- a/software/apps/baremetal/chest_f16/main.c
+++ b/software/apps/baremetal/chest_f16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_chest_f16.h"
 #include "data_chest_f16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
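+
+Exactly one of the two should be defined; this file enables PARALLEL and
+leaves SINGLE commented out below.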
+*/
+
 //#define SINGLE
 #define PARALLEL
 
diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
index 572b12de0..6f7a73938 100644
--- a/software/apps/baremetal/chest_q16/main.c
+++ b/software/apps/baremetal/chest_q16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_chest_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 #define PARALLEL
 
 int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]
diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
index 6d1c26ff2..10baa6a81 100644
--- a/software/apps/baremetal/cholesky_f16/main.c
+++ b/software/apps/baremetal/cholesky_f16/main.c
@@ -17,6 +17,15 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Cholesky Decomposition.
+PARALLEL: When defined runs parallel Cholesky Decomposition.
+FOLDED: When set to 1, intermediate results are folded in memory.
+*/
+
 #define SINGLE
 #define FOLDED (0)
 
diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c
index 64fbf3b2f..161d17b30 100644
--- a/software/apps/baremetal/cholesky_q32/main.c
+++ b/software/apps/baremetal/cholesky_q32/main.c
@@ -11,7 +11,6 @@
 #include "synchronization.h"
 
 #define HALF (1023)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
 #define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT))
 #define ABS(a) (a > 0 ? a : -a)
@@ -31,18 +30,19 @@
 #define N_COL 1
 #define N_ROW 1
 int32_t l1_A[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
 int32_t l1_L[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
+int32_t l1_y[matrix_N] __attribute__((aligned(NUM_BANKS), section(".l1")));
 #else
-int32_t l1_AA[matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LL[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LR[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_AA[matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LL[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LR[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_yy[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -58,11 +58,12 @@ int main() {
     for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
       l1_yy[idx_col * matrix_N + i] = l2_y[i];
      for (uint32_t j = 0; j < matrix_N; j++) {
-        l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j];
+        l1_AA[idx_col * matrix_N + i * NUM_BANKS + j] =
+            l2_A[i * matrix_N + j];
       }
     }
   }
-  for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) {
+  for (uint32_t i = 0; i < N_ROW * matrix_N * NUM_BANKS; i++) {
     l1_LL[i] = 0;
     l1_LR[i] = 0;
   }
diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
index aa2ed55a6..727dba7ca 100644
--- a/software/apps/baremetal/cmatmul_f16/main.c
+++ b/software/apps/baremetal/cmatmul_f16/main.c
@@ -19,7 +19,18 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
 
-#define PARALLEL_4x4
+
+/*
+======================
+Parameters and defines
+
+SINGLE_2x2: Single-core matmul on 2x2 tiles.
+PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
+PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
+PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
+PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory
+to avoid banking conflicts.
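+
+For example, with PARALLEL_4x4 each core computes 4x4 tiles of C; on the
+standard 256-core MemPool configuration, one round of tiles covers a 64x64
+block of the output matrix.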
+*/
+
+#define PARALLEL_4x4
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -51,7 +62,7 @@ int main() {
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
 
-#if defined(SINGLE_CORE)
+#if defined(SINGLE_2x2)
   // Execute function to test.
   if (core_id == 0) {
     mempool_start_benchmark();
diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
index 0dcffbfc7..37089fd5b 100644
--- a/software/apps/baremetal/cmatmul_q16/main.c
+++ b/software/apps/baremetal/cmatmul_q16/main.c
@@ -16,6 +16,14 @@
 #include "baremetal/mempool_cmatmul_q16.h"
 #include "data_cmatmul_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
 #define PARALLEL
 #define dim_M (matrix_M)
 #define dim_N (matrix_N)
diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
index 2091f0336..3b8b272b9 100644
--- a/software/apps/baremetal/dotp_f16/main.c
+++ b/software/apps/baremetal/dotp_f16/main.c
@@ -14,9 +14,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,18 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  // // SINGLE-CORE
-  // time_init = mempool_get_timer();
-  // dotp_f16s(l1_X, l1_Y, sum, array_N);
-  // // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N);
-  // time_end = mempool_get_timer();
-
-  // // PARALLEL
-  // time_init = mempool_get_timer();
-  // dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores);
-  // // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores);
-  // time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
index 3507795b1..e1a87b6b8 100644
--- a/software/apps/baremetal/dotp_f32/main.c
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -15,9 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  // // SINGLE-CORE
-  // time_init = mempool_get_timer();
-  // dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N);
-  // time_end = mempool_get_timer();
-
-  // // PARALLEL
-  // time_init = mempool_get_timer();
-  // dotp_f32p(l1_A, l1_B, sum, array_N, num_cores);
-  // time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
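/*
 * A minimal sketch of the two reduction strategies documented in the
 * mempool_dotp_*.h headers below; `sums`, `partial`, `core_id`, and
 * `num_cores` are assumed names, not code from this patch.
 *
 *   sums[core_id] = partial; // per-core chunk of the dot-product
 *   mempool_barrier(num_cores);
 *   // SINGLE_CORE_REDUCTION: one core accumulates all partial sums.
 *   if (core_id == 0)
 *     for (uint32_t i = 1; i < num_cores; i++)
 *       sums[0] += sums[i];
 *   // BINARY_REDUCTION (alternative): partial sums are combined pairwise
 *   // in log2(num_cores) steps.
 *   for (uint32_t s = 1; s < num_cores; s <<= 1) {
 *     if ((core_id % (2 * s)) == 0)
 *       sums[core_id] += sums[core_id + s];
 *     mempool_barrier(num_cores);
 *   }
 */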
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index ee2e2ea52..8f6490ee2 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -15,11 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_i32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-#define LOG_BARRIERS
-// #define ATOMIC_REDUCTION
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
@@ -49,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  // // SINGLE-CORE
-  // time_init = mempool_get_timer();
-  // dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N);
-  // time_end = mempool_get_timer();
-
-  // // PARALLEL
-  // time_init = mempool_get_timer();
-  // dotp_i32p(l1_A, l1_B, sum, array_N, num_cores);
-  // time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c
index 99a0269cc..9964257ca 100644
--- a/software/apps/baremetal/matmul_f16/main.c
+++ b/software/apps/baremetal/matmul_f16/main.c
@@ -17,7 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f16.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
+#define PARALLEL
 
 __fp16 matrix_a[matrix_M * matrix_N]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c
index d3d7622db..ba9165ed1 100644
--- a/software/apps/baremetal/matmul_f32/main.c
+++ b/software/apps/baremetal/matmul_f32/main.c
@@ -17,8 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f32.h"
 
-#define PARALLEL
-#define ASM
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
+#define PARALLEL
 
 float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index 80309a1e0..b1ef24451 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -18,25 +18,45 @@
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 #include "data_mimo_mmse_f16.h"
 
-#define ZF (0)   // When asserted use zero-forcing
-#define FOLD (1) // When asserted fold matrices in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
+
+/*
+======================
+Parameters and defines
+
+DOUBLE_BUFFERING: When defined benchmark double-buffered MIMO-MMSE, including
+L2-L1 transfers.
+
+For MIMO-MMSE without L2-L1 transfers:
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When set to 1, use the zero-forcing detector.
+FOLD: When set to 1, fold matrices in memory.
+
+For MIMO-MMSE with L2-L1 transfers:
+DMA_TRANSFER1: When defined, transfer the inputs for the next round at the
+beginning of the computation.
+DMA_TRANSFER2: When defined, transfer the inputs for the next round after the
+Hermitian computation.
+N_ROUNDS: Number of rounds of double-buffering.
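+
+Example: with the defines below, the benchmark runs the parallel (PARALLEL),
+SIMD-vectorized (VEC) MIMO-MMSE without L2-L1 transfers, using MMSE
+regularization (ZF = 0) and matrices folded in memory (FOLD = 1).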
+*/
+
+#define ZF (0)
+#define FOLD (1)
 #define PARALLEL
 #define VEC
 
+#ifndef DOUBLE_BUFFERING
+
 /**********************************************************
 **********************************************************
-  _   _  ___        _     _ _____                       __
- | \ | |/ _ \      | |   / |_   _| __ __ _ _ __  ___   / _|
- | \| | | | |_____| |  | | | || '__/ _` | '_ \/ __| |_
- | |\ | |_| |_____| |___| | | || | | (_| | | | \__ \ _|
- |_| \_|\___/ |_____|_| |_||_| \__,_|_| |_|___/_|(_)
+  _   _  ___    _____                       __
+ | \ | |/ _ \  |_   _| __ __ _ _ __  ___   / _|
+ | \| | | | |_____ | || '__/ _` | '_ \/ __| |_
+ | |\ | |_| |_____ | || | | (_| | | | \__ \ _|
+ |_| \_|\___/ |_||_| \__,_|_| |_|___/_|(_)
 ***********************************************************
 ***********************************************************/
 
-#ifndef DOUBLE_BUFFERING
-
 #if FOLD
 #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
 #define NUM_COL (NUM_BANKS / N_TX)
@@ -193,6 +213,8 @@ int main() {
   return 0;
 }
 
+#else
+
 /**********************************************************
 **********************************************************
   ____   __  __    _     _____                       __
 |  _ \ |  \/  |  / \   |_   _| __ __ _ _ __  ___   / _|
 | | | || |\/| | / _ \    | || '__/ _` | '_ \/ __| |_
 | |_| || |  | |/ ___ \   | || | | (_| | | | \__ \ _|
 |____/ |_|  |_/_/   \_\  |_||_| \__,_|_| |_|___/_|(_)
 ***********************************************************
 ***********************************************************/
 
-#else
-#define N_ROUNDS (1)
-#define DMA_TRANSFER1
-
 // Inputs-Outputs even double-buffering rounds
 __fp16 l1A_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c
index d243754fc..fb054e4e0 100644
--- a/software/apps/baremetal/mimo_mmse_f32/main.c
+++ b/software/apps/baremetal/mimo_mmse_f32/main.c
@@ -21,6 +21,17 @@
 
 #include "data_mimo_mmse_f32.h"
 
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+PARALLEL_HERMITIAN: When defined, the Hermitian computation is parallelized at
+fine granularity over a group of cores.
+ZF: When set to 1, use the zero-forcing detector.
+FOLD: When set to 1, fold matrices in memory.
+*/
+
 #define SINGLE
 #define ZF (0)
 #define FOLD (0)
diff --git a/software/apps/baremetal/mimo_mmse_f8/main.c b/software/apps/baremetal/mimo_mmse_f8/main.c
index c5d1cd77e..006dbf83b 100644
--- a/software/apps/baremetal/mimo_mmse_f8/main.c
+++ b/software/apps/baremetal/mimo_mmse_f8/main.c
@@ -18,9 +18,20 @@
 #include "baremetal/mempool_mimo_mmse_f8s.h"
 #include "data_mimo_mmse_f8.h"
 
+
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When set to 1, use the zero-forcing detector.
+FOLD: When set to 1, fold matrices in memory.
+*/
+
 #define ZF (0)   // When asserted use zero-forcing
 #define FOLD (0) // When asserted fold matrixes in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
 
 #define PARALLEL
 #define VEC
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
index 9bcb5e9db..8e2b557a4 100644
--- a/software/apps/baremetal/mimo_mmse_q16/main.c
+++ b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -16,7 +16,13 @@
 #include "baremetal/mempool_linearsolver_q16s.h"
 #include "baremetal/mempool_mimo_mmse_q16s.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+*/
+
+#define PARALLEL
 
 int16_t l1_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
diff --git a/software/apps/baremetal/ofdm_f16/main.c b/software/apps/baremetal/ofdm_f16/main.c
index 264768199..3cf04dbed 100644
--- a/software/apps/baremetal/ofdm_f16/main.c
+++ b/software/apps/baremetal/ofdm_f16/main.c
@@ -18,7 +18,6 @@
 #include "synchronization.h"
 
 #include "data_ofdm_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // CFFT Parameters
 #define SCHEDULED
@@ -28,7 +27,7 @@
 #define N_FFTs_COL 4
 #define N_FFTs_ROW (N_RX / N_FFTs_COL)
 // CMATMUL Parameters
-#define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX))
+#define NUM_COPIES (NUM_BANKS / (N_BEAMS * N_RX))
 #define dim_M (N_BEAMS)
 #define dim_N (N_RX)
 #define dim_P (N_SC)
@@ -43,18 +42,18 @@ dump(checkpoint, 1);
 uint32_t arrival_index __attribute__((section(".l1_prio")));
 __fp16 l1_pBF_Coef_folded[2 * BANKING_FACTOR * NUM_CORES]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 
-__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[6 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[6 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[6 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[6 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /* MAIN */
@@ -67,7 +66,7 @@ int main() {
   mempool_start_benchmark();
   if (core_id == 0) {
     // Each FFT is folded over 4 memory rows
-    // Each memory row is 2 * N_BANKS samples
+    // Each memory row is 2 * NUM_BANKS samples
     __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED);
     dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src,
                         (N_RX * N_SC) * sizeof(int32_t));
@@ -78,7 +77,7 @@
                           dim_M * dim_N * sizeof(int32_t));
     }
     for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
+      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS),
                           l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t));
     }
   }
@@ -114,7 +113,7 @@ int main() {
   dma_memcpy_blocking(l2_pBF_Dst, l1_pFFT_Dst, (N_RX * N_SC) * sizeof(int32_t));
   for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-    dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS),
                         l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t));
   }
   __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED);
diff --git a/software/kernels/baremetal/mempool_chest_q16.h b/software/kernels/baremetal/mempool_chest_q16.h
index b4d90adff..6e735bbe7 100644
--- a/software/kernels/baremetal/mempool_chest_q16.h
+++ b/software/kernels/baremetal/mempool_chest_q16.h
@@ -6,7 +6,7 @@
 #pragma once
 #include "builtins_v2.h"
 
-#define __MUL
+#define __MUL // Multiplication by pilot instead of division.
 
 /* a[i] = ar[i] + i * ai[j]
    out[i][j] = a[i] / c[j]
diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h
index 3b42bdb80..1d7b67e36 100644
--- a/software/kernels/baremetal/mempool_cholesky_f16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f16s.h
@@ -7,7 +7,6 @@
 
 #pragma once
 #include "builtins_v2.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 #ifdef __XDIVSQRT
 
@@ -29,7 +28,7 @@ void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
   __fp16 ap, bp; // Pivot element
   __fp16 as, bs; // Sum element
   uint32_t i, j, k;
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
     // Elements on diagonal (input matrix is positive-definite)
@@ -103,7 +102,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
   v2h apbp, dgdg;
   v2h ab, cd;
   uint32_t i, j, k;
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
@@ -383,7 +382,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
   v2h ab, cd, ndc;
   uint32_t i, j, k;
 
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
diff --git a/software/kernels/baremetal/mempool_cholesky_f32s.h b/software/kernels/baremetal/mempool_cholesky_f32s.h
index 63fd878dc..135d00fcd 100644
--- a/software/kernels/baremetal/mempool_cholesky_f32s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f32s.h
@@ -4,9 +4,7 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
 #pragma once
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-
 #ifdef __XDIVSQRT
 
 /**
@@ -26,7 +23,7 @@ void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n,
   float ap, bp; // Pivot element
   float as, bs; // Sum element
   uint32_t i, j, k;
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
diff --git a/software/kernels/baremetal/mempool_cholesky_q16s.h b/software/kernels/baremetal/mempool_cholesky_q16s.h
index dc20a2b94..fe7c2bd8a 100644
--- a/software/kernels/baremetal/mempool_cholesky_q16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_q16s.h
@@ -7,7 +7,6 @@
 #pragma once
 #include "baremetal/mempool_sqrt_q32s.h"
 #include "builtins_v2.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /** VECTORIZED CODE
     @brief Cholesky decomposition with Crout algorithm.
diff --git a/software/kernels/baremetal/mempool_cholesky_q32p.h b/software/kernels/baremetal/mempool_cholesky_q32p.h
index 88819e842..ec40172f5 100644
--- a/software/kernels/baremetal/mempool_cholesky_q32p.h
+++ b/software/kernels/baremetal/mempool_cholesky_q32p.h
@@ -325,13 +325,13 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n,
       uint32_t matrix_row = (FoldLeft == 1) ?
j : (n - 1 - j); /* Elements on the diagonal are computed with a single core */ if (core_id == core_idx) { - pivot = pSrc[j * N_BANKS + j]; + pivot = pSrc[j * NUM_BANKS + j]; sum = 0; for (k = 0; k < 4 * (j >> 2U); k++) { - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; - a3 = pL[matrix_row + (k + 3) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[matrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "mul %[a2],%[a2],%[a2];" @@ -355,9 +355,9 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, } switch (j % 4) { case 3: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" @@ -376,8 +376,8 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, :); break; case 2: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "addi %[a0],%[a0],%[h];" @@ -391,7 +391,7 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, :); break; case 1: - a0 = pL[matrix_row + k * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -403,7 +403,8 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, case 0: break; } - pL[matrix_row + j * N_BANKS] = mempool_sqrt_q32s(pivot - sum, FIXED_POINT); + pL[matrix_row + j * NUM_BANKS] = + mempool_sqrt_q32s(pivot - sum, FIXED_POINT); } return; } @@ -427,17 +428,17 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, if (core_id == core_idx) { sum = 0; - pivot = pSrc[i * N_BANKS + j]; - diag = pL[jmatrix_row + j * N_BANKS]; + pivot = pSrc[i * NUM_BANKS + j]; + diag = pL[jmatrix_row + j * NUM_BANKS]; for (k = 0; k < 4 * (j >> 2U); k += 4) { - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - a3 = pL[imatrix_row + (k + 3) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; - b3 = pL[jmatrix_row + (k + 3) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[imatrix_row + (k + 3) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; + b3 = pL[jmatrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -462,12 +463,12 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, } switch (j % 4) { case 3: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; + a0 = 
pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -487,10 +488,10 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, :); break; case 2: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -505,8 +506,8 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, :); break; case 1: - a0 = pL[imatrix_row + k * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -518,7 +519,7 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, case 0: break; } - pL[imatrix_row + j * N_BANKS] = FIX_DIV((pivot - sum), diag); + pL[imatrix_row + j * NUM_BANKS] = FIX_DIV((pivot - sum), diag); } } return; @@ -557,23 +558,25 @@ void mempool_cholesky_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (j = 0; j < n; j++) { for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { - mempool_cholesky_q32p_sqrtsum( - pSrcA + column_id * n, pLL + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 1); // FoldLeft - mempool_cholesky_q32p_sqrtsum( - pSrcB + column_id * n, pLR + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 0); // FoldRight + mempool_cholesky_q32p_sqrtsum(pSrcA + column_id * n, + pLL + idx_col * n + + idx_row * (n * NUM_BANKS), + core_id, n, j, 1); // FoldLeft + mempool_cholesky_q32p_sqrtsum(pSrcB + column_id * n, + pLR + idx_col * n + + idx_row * (n * NUM_BANKS), + core_id, n, j, 0); // FoldRight } } mempool_log_partial_barrier(2, absolute_core_id, n_col * (n >> 2U)); for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_cholesky_q32p_divisum( - pSrcA + column_id * n, pLL + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 1); + pSrcA + column_id * n, + pLL + idx_col * n + idx_row * (n * NUM_BANKS), core_id, n, j, 1); mempool_cholesky_q32p_divisum( - pSrcB + column_id * n, pLR + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 0); + pSrcB + column_id * n, + pLR + idx_col * n + idx_row * (n * NUM_BANKS), core_id, n, j, 0); } } mempool_log_partial_barrier(2, absolute_core_id, n_col * (n >> 2U)); diff --git a/software/kernels/baremetal/mempool_cholesky_q32s.h b/software/kernels/baremetal/mempool_cholesky_q32s.h index c7a5a60c7..5ce497e96 100644 --- a/software/kernels/baremetal/mempool_cholesky_q32s.h +++ b/software/kernels/baremetal/mempool_cholesky_q32s.h @@ -320,7 +320,7 @@ void mempool_cholesky_schedule_q32s(int32_t *pSrc, int32_t *pL, uint32_t idx_row, idx_col = core_id; for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_cholesky_crout_q32s(pSrc + idx_col * n, - pL + idx_col * n + idx_row * N_BANKS, n); + pL + idx_col * n + idx_row * NUM_BANKS, n); } mempool_log_partial_barrier(2, core_id, 
n_col * (n >> 2U)); } diff --git a/software/kernels/baremetal/mempool_cmatmul_f16.h b/software/kernels/baremetal/mempool_cmatmul_f16.h index 12645c454..374144f64 100644 --- a/software/kernels/baremetal/mempool_cmatmul_f16.h +++ b/software/kernels/baremetal/mempool_cmatmul_f16.h @@ -13,10 +13,8 @@ #pragma once #include "builtins_v2.h" -// Use complex dotp in a single offload -#define __CDOTP -// Shift cores startpoint over rows of matrix A -#define __SHIFT_A +#define __CDOTP // Use complex dotp in a single offload +#define __SHIFT_A // Shift cores startpoint over rows of matrix A /****************************************************************************** __ ___ _ _ ____ _ @@ -559,7 +557,7 @@ void cmatmul_4x4_f16p(__fp16 const *__restrict__ A, return; } -// 4x4 MATMUL with copies of A matrix (for M*N < N_BANKS) +// 4x4 MATMUL with copies of A matrix (for M*N < NUM_BANKS) void cmatmul_4x4_f16p_copy_A(__fp16 const *__restrict__ A_l2, __fp16 *__restrict__ A_l1, __fp16 const *__restrict__ B, diff --git a/software/kernels/baremetal/mempool_cmatmul_q16.h b/software/kernels/baremetal/mempool_cmatmul_q16.h index aa6a71b6c..78ed04b31 100644 --- a/software/kernels/baremetal/mempool_cmatmul_q16.h +++ b/software/kernels/baremetal/mempool_cmatmul_q16.h @@ -13,8 +13,7 @@ #pragma once #include "builtins_v2.h" -// Shift cores startpoint over rows of matrix A -#define __SHIFT_A +#define __SHIFT_A // Shift cores startpoint over rows of matrix A #define CMATMUL_1x1_LOOP \ v2s sum = {0, 0}; \ diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h index 791b7c68e..e8083cfcf 100644 --- a/software/kernels/baremetal/mempool_dotp_f16.h +++ b/software/kernels/baremetal/mempool_dotp_f16.h @@ -7,6 +7,16 @@ #pragma once #include "builtins_v2.h" +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPF16VEC_UNROLLED4_LOOP \ { \ a01 = (*(v2h *)&in_a[i]); \ diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h index 290b96d59..13e4ff9e5 100644 --- a/software/kernels/baremetal/mempool_dotp_f32.h +++ b/software/kernels/baremetal/mempool_dotp_f32.h @@ -4,6 +4,16 @@ // Author: Marco Bertuletti, ETH Zurich +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPF32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ diff --git a/software/kernels/baremetal/mempool_dotp_i32.h b/software/kernels/baremetal/mempool_dotp_i32.h index 4b80e92ed..3f8320b91 100644 --- a/software/kernels/baremetal/mempool_dotp_i32.h +++ b/software/kernels/baremetal/mempool_dotp_i32.h @@ -4,6 +4,18 @@ // Author: Marco Bertuletti, ETH Zurich +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +ATOMIC_REDUCTION: Reduction with atomics. 
+LOG_BARRIERS: Use binary reduction +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPI32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ diff --git a/software/kernels/baremetal/mempool_linearsolver_f16s.h b/software/kernels/baremetal/mempool_linearsolver_f16s.h index c4e134527..919fd3585 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f16s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f16s.h @@ -5,8 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) - #ifdef __XDIVSQRT /** @@ -30,7 +28,7 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n, __fp16 as, bs; __fp16 ax, bx; __fp16 diag; - const uint32_t offset = folded ? N_BANKS : n; + const uint32_t offset = folded ? NUM_BANKS : n; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { @@ -98,7 +96,7 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n, __fp16 as, bs; __fp16 diag; - const uint32_t offset = folded ? N_BANKS : n; + const uint32_t offset = folded ? NUM_BANKS : n; float ax, bx, diag_f32; v2h res; diff --git a/software/kernels/baremetal/mempool_linearsolver_f32s.h b/software/kernels/baremetal/mempool_linearsolver_f32s.h index d3297397c..02f38b698 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f32s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f32s.h @@ -5,8 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) - #ifdef __XDIVSQRT /** @@ -29,7 +27,7 @@ void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n, float as, bs; float ax, bx; float diag; - const uint32_t offset = folded ? N_BANKS : n; + const uint32_t offset = folded ? NUM_BANKS : n; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { diff --git a/software/kernels/baremetal/mempool_linearsolver_q32p.h b/software/kernels/baremetal/mempool_linearsolver_q32p.h index 49b629259..5fbc2f230 100644 --- a/software/kernels/baremetal/mempool_linearsolver_q32p.h +++ b/software/kernels/baremetal/mempool_linearsolver_q32p.h @@ -135,13 +135,13 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, /* Elements on the diagonal are computed with a single core */ if (core_id == core_idx) { in = pIn[j]; - pivot = pSrc[matrix_row * N_BANKS + j]; + pivot = pSrc[matrix_row * NUM_BANKS + j]; sum = 0; for (k = 0; k < 4 * (j >> 2U); k++) { - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; - a3 = pL[matrix_row + (k + 3) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[matrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "mul %[a2],%[a2],%[a2];" @@ -165,9 +165,9 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } switch (j % 4) { case 3: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" @@ -186,8 +186,8 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 2: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; 
+ a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "addi %[a0],%[a0],%[h];" @@ -201,7 +201,7 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 1: - a0 = pL[matrix_row + k * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -215,7 +215,7 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } result = mempool_sqrt_q32s(pivot - sum, FIXED_POINT); pIn[j] = FIX_DIV(in, result); - pL[matrix_row + j * N_BANKS] = result; + pL[matrix_row + j * NUM_BANKS] = result; } } @@ -238,19 +238,19 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, if (core_id == core_idx) { sum = 0; - pivot = pSrc[i * N_BANKS + j]; - diag = pL[jmatrix_row + j * N_BANKS]; + pivot = pSrc[i * NUM_BANKS + j]; + diag = pL[jmatrix_row + j * NUM_BANKS]; in = pIn[j]; sum_r = pIn[i]; for (k = 0; k < 4 * (j >> 2U); k += 4) { - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - a3 = pL[imatrix_row + (k + 3) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; - b3 = pL[jmatrix_row + (k + 3) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[imatrix_row + (k + 3) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; + b3 = pL[jmatrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -275,12 +275,12 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } switch (j % 4) { case 3: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -300,10 +300,10 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 2: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -318,8 +318,8 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 1: - a0 = pL[imatrix_row + k * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -333,7 +333,7 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } result = 
FIX_DIV((pivot - sum), diag); pIn[i] = sum_r - result * in; - pL[imatrix_row + j * N_BANKS] = result; + pL[imatrix_row + j * NUM_BANKS] = result; } } } @@ -353,10 +353,10 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, a1 = pIn[k - 1]; a2 = pIn[k - 2]; a3 = pIn[k - 3]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k - 1) + i * N_BANKS]; - b2 = pL[(k - 2) + i * N_BANKS]; - b3 = pL[(k - 3) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k - 1) + i * NUM_BANKS]; + b2 = pL[(k - 2) + i * NUM_BANKS]; + b3 = pL[(k - 3) + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -384,9 +384,9 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, a0 = pIn[k]; a1 = pIn[k - 1]; a2 = pIn[k - 2]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k - 1) + i * N_BANKS]; - b2 = pL[(k - 2) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k - 1) + i * NUM_BANKS]; + b2 = pL[(k - 2) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -408,8 +408,8 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, case 2: a0 = pIn[k]; a1 = pIn[k - 1]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k - 1) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k - 1) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -425,7 +425,7 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, break; case 3: a0 = pIn[k]; - b0 = pL[k + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -437,7 +437,7 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, case 0: break; } - pIn[i] = FIX_DIV(sum, pL[i * N_BANKS + i]); + pIn[i] = FIX_DIV(sum, pL[i * NUM_BANKS + i]); } } } @@ -457,10 +457,10 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, a1 = pIn[n - 1 - k - 1]; a2 = pIn[n - 1 - k - 2]; a3 = pIn[n - 1 - k - 3]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k + 1) + i * N_BANKS]; - b2 = pL[(k + 2) + i * N_BANKS]; - b3 = pL[(k + 3) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k + 1) + i * NUM_BANKS]; + b2 = pL[(k + 2) + i * NUM_BANKS]; + b3 = pL[(k + 3) + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -488,9 +488,9 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, a0 = pIn[n - 1 - k]; a1 = pIn[n - 1 - k - 1]; a2 = pIn[n - 1 - k - 2]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k + 1) + i * N_BANKS]; - b2 = pL[(k + 2) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k + 1) + i * NUM_BANKS]; + b2 = pL[(k + 2) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -512,8 +512,8 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, case 2: a0 = pIn[n - 1 - k]; a1 = pIn[n - 1 - k - 1]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k + 1) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k + 1) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -529,7 +529,7 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, break; case 1: a0 = pIn[n - 1 - k]; - b0 = pL[k + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -541,7 +541,7 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, case 0: break; } - pIn[i] = FIX_DIV(sum, pL[i * N_BANKS + i]); + pIn[i] = 
FIX_DIV(sum, pL[i * NUM_BANKS + i]); } } } @@ -585,10 +585,10 @@ void mempool_linearsolver_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32p_sqrtsum( - pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * N_BANKS), + pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, j, 1); mempool_linearsolver_q32p_sqrtsum( - pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * N_BANKS), + pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, j, 0); } } @@ -596,10 +596,10 @@ void mempool_linearsolver_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32p_divisum( - pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * N_BANKS), - pIn + idx_col * n + idx_row * N_BANKS, core_id, n, j, 1); + pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * NUM_BANKS), + pIn + idx_col * n + idx_row * NUM_BANKS, core_id, n, j, 1); mempool_linearsolver_q32p_divisum( - pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * N_BANKS), + pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, j, 0); } } @@ -609,10 +609,10 @@ void mempool_linearsolver_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32p_trisolverL(pLL + idx_col * n + - idx_row * (n * N_BANKS), + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n); mempool_linearsolver_q32p_trisolverR( - pLR + idx_col * n + idx_row * (n * N_BANKS), pIn + idx_col * n, + pLR + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, n_col * (n >> 2U)); } } diff --git a/software/kernels/baremetal/mempool_linearsolver_q32s.h b/software/kernels/baremetal/mempool_linearsolver_q32s.h index d03ec273f..848c0fdeb 100644 --- a/software/kernels/baremetal/mempool_linearsolver_q32s.h +++ b/software/kernels/baremetal/mempool_linearsolver_q32s.h @@ -24,7 +24,7 @@ void mempool_lowtrisolver_q32s(int32_t *pL, int32_t *pIn, const uint32_t n, int32_t in0, in1, in2, in3; int32_t l0, l1, l2, l3; - uint32_t OFFSET = (folded == 1) ? N_BANKS : n; + uint32_t OFFSET = (folded == 1) ? NUM_BANKS : n; for (i = 0; i < n; i++) { sum = pIn[i]; @@ -140,7 +140,7 @@ void mempool_uprtrisolver_q32s(int32_t *pL, int32_t volatile *pIn, int32_t in0, in1, in2, in3; int32_t l0, l1, l2, l3; - uint32_t OFFSET = (folded == 1) ? N_BANKS : n; + uint32_t OFFSET = (folded == 1) ? NUM_BANKS : n; for (i = n - 1; i < n; i--) { sum = pIn[i]; @@ -266,7 +266,7 @@ void mempool_linearsolver_q32s(int32_t *pSrc, int32_t *pL, int32_t a0, a1, a2, a3; int32_t b0, b1, b2, b3; - uint32_t OFFSET = (folded == 1) ? N_BANKS : n; + uint32_t OFFSET = (folded == 1) ? 
NUM_BANKS : n; for (j = 0; j < n; j++) { in = pIn[j]; @@ -483,9 +483,9 @@ void mempool_linearsolver_schedule_q32s(int32_t *pSrc, int32_t *pL, uint32_t idx_row, idx_col = core_id; for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32s(pSrc + idx_col * n, - pL + idx_col * n + idx_row * N_BANKS, + pL + idx_col * n + idx_row * NUM_BANKS, pIn + idx_col * n, n, 1); - mempool_uprtrisolver_q32s(pL + idx_col * n + idx_row * N_BANKS, + mempool_uprtrisolver_q32s(pL + idx_col * n + idx_row * NUM_BANKS, pIn + idx_col * n, n, 1); } mempool_log_partial_barrier(2, core_id, n_col * (n >> 2U)); diff --git a/software/kernels/baremetal/mempool_matmul_f32.h b/software/kernels/baremetal/mempool_matmul_f32.h index 8879fa52d..c6b669bcc 100644 --- a/software/kernels/baremetal/mempool_matmul_f32.h +++ b/software/kernels/baremetal/mempool_matmul_f32.h @@ -13,6 +13,8 @@ #pragma once #include "builtins_v2.h" +// When ASM is defined, the 4x4 matmul is executed with asm volatile statements. +#define ASM void matmul_2x2_single_f32(float const *__restrict__ A, float const *__restrict__ B, float *__restrict__ C, diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h index 91e3aa789..134da8905 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h @@ -7,7 +7,6 @@ #pragma once #include "builtins_v2.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) /****************************************************************************** _____ __ _ @@ -112,7 +111,7 @@ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS, bs3 = (__fp16)0.0f; } } - uint32_t const offset = folded ? N_BANKS : n_tx; + uint32_t const offset = folded ? NUM_BANKS : n_tx; // Store pG[2 * (i * offset + j)] = as0; pG[2 * (i * offset + j + 1U)] = as1; @@ -285,7 +284,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i])); } // Store - uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j); + uint32_t addr = folded ? 2 * (i * NUM_BANKS + j) : 2 * (i * n_tx + j); (*(v2h *)&pG[addr]) = res0; } @@ -356,7 +355,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i])); } } - uint32_t const offset = folded ? N_BANKS : n_tx; + uint32_t const offset = folded ? NUM_BANKS : n_tx; // Store (*(v2h *)&pG[2 * (i * offset + j)]) = res0; (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1; @@ -415,7 +414,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i])); } } - uint32_t const offset = folded ? N_BANKS : n_tx; + uint32_t const offset = folded ? NUM_BANKS : n_tx; // Store (*(v2h *)&pG[2 * (i * offset + j)]) = res0; (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1; diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h index 7e3e6fe1a..13ebcb537 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h @@ -5,7 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) /** @brief Computes the Hermitian matrix G = (H'*H + pS^2I). @@ -102,7 +101,7 @@ void mempool_hermitian_f32p(float *pH, float *pG, float *pS, bs3 = 0.0f; } } - uint32_t const offset = folded ? 
NUM_BANKS : n_tx; // Store pG[2 * (i * offset + j)] = as0; pG[2 * (i * offset + j + 1U)] = as1; diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h index 3ce36f3b6..12b2320e5 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h @@ -48,7 +48,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, // STORE INDEXES #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; - i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -158,9 +158,9 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + * 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + @@ -173,7 +173,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -278,9 +278,9 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h index a0c1a7791..5db6be1cf 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h @@ -50,7 +50,7 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, // STORE INDEXES #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; - i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -227,9 +227,9 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + * 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + @@ -242,7 +242,7 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) 
/ n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -403,9 +403,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h index c6b4acf6b..90a3fd093 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h @@ -5,7 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define BITREVERSETABLE #include "builtins_v2.h" #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -33,19 +32,19 @@ #ifdef FOLDED_TWIDDLES #define LOAD_STORE_TWIDDLEFACT \ CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ - CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ - CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)]; \ if (ic % 4 == 0) { \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ @@ -315,8 +314,8 @@ void mempool_radix4_cfft_f16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -345,8 +344,8 @@ void mempool_radix4_cfft_f16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -370,8 +369,8 @@ void mempool_radix4_cfft_f16p_scheduler( uint32_t col_shift = fftLen / 4; #endif - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift; 
radix4_butterfly_last(pIn, pOut, i0); } } @@ -416,7 +415,7 @@ void mempool_radix4_cfft_f16p_scheduler( : [s2] "r"(s2) :); for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8)); // Load at address a tmpa1 = *(uint32_t *)&ptr[a1]; tmpa2 = *(uint32_t *)&ptr[a2]; @@ -465,12 +464,12 @@ void mempool_radix4_cfft_f16p_scheduler( idx3 = idx3 >> 1U; } idx0 = ic / 4; - idx1 = ic / 4 + N_BANKS; - idx2 = ic / 4 + 2 * N_BANKS; - idx3 = ic / 4 + 3 * N_BANKS; + idx1 = ic / 4 + NUM_BANKS; + idx2 = ic / 4 + 2 * NUM_BANKS; + idx3 = ic / 4 + 3 * NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); - ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8); *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); diff --git a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h index e91602866..ef6122a00 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h @@ -45,19 +45,19 @@ #define LOAD_STORE_TWIDDLEFACT \ CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ - CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ - CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)]; \ if (ic % 4 == 0) { \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ @@ -226,16 +226,16 @@ static inline void fold_radix4(int16_t *pSrc16, uint32_t fftLen, i1 = i0 + n2; i2 = i1 + n2; i3 = i2 + n2; - i1_store = i0 + N_BANKS; - i2_store = i1_store + N_BANKS; - i3_store = i2_store + N_BANKS; + i1_store = i0 + NUM_BANKS; + i2_store = i1_store + NUM_BANKS; + i3_store = i2_store + NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * N_BANKS)]; - B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * N_BANKS)]; - C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * N_BANKS)]; - *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * N_BANKS)] = A; - *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * N_BANKS)] = B; - *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * N_BANKS)] = C; + A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * NUM_BANKS)]; + B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * NUM_BANKS)]; + C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * NUM_BANKS)]; + *(v2s 
*)&pSrc16[i1_store * 2U + idx_row * (8 * NUM_BANKS)] = A; + *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * NUM_BANKS)] = B; + *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * NUM_BANKS)] = C; } } mempool_log_partial_barrier(2, absolute_core_id, nPE); @@ -426,8 +426,8 @@ void mempool_radix4_cfft_q16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -460,8 +460,8 @@ void mempool_radix4_cfft_q16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -489,8 +489,8 @@ void mempool_radix4_cfft_q16p_scheduler( uint32_t col_shift = fftLen / 4; #endif - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift; radix4_butterfly_last(pIn, pOut, i0); } } @@ -535,7 +535,7 @@ void mempool_radix4_cfft_q16p_scheduler( : [s2] "r"(s2) :); for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8)); // Load at address a tmpa1 = *(uint32_t *)&ptr[a1]; tmpa2 = *(uint32_t *)&ptr[a2]; @@ -584,12 +584,12 @@ void mempool_radix4_cfft_q16p_scheduler( idx3 = idx3 >> 1U; } idx0 = ic / 4; - idx1 = ic / 4 + N_BANKS; - idx2 = ic / 4 + 2 * N_BANKS; - idx3 = ic / 4 + 3 * N_BANKS; + idx1 = ic / 4 + NUM_BANKS; + idx2 = ic / 4 + 2 * NUM_BANKS; + idx3 = ic / 4 + 3 * NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); - ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8); *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 2602804cc..94f822ddc 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -27,7 +27,7 @@ DATA_DIR ?= $(abspath $(ROOT_DIR)/../data) COMPILER ?= gcc XPULPIMG ?= $(xpulpimg) ZFINX ?= $(zfinx) -XDIVSQRT ?= $(xDivSqrt) +XDIVSQRT ?= $(xDivSqrt) RISCV_XLEN ?= 32 @@ -92,6 +92,7 @@ DEFINES += -DNUM_CORES=$(num_cores) DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) DEFINES += -DBANKING_FACTOR=$(banking_factor) +DEFINES += -DNUM_BANKS=$(shell awk 'BEGIN{print $(banking_factor)*$(num_cores)}') DEFINES += 
-DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}') DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}') DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')
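
With NUM_BANKS now computed once in runtime.mk from banking_factor and num_cores, the kernels no longer carry a private N_BANKS define. A minimal compile-time check, hypothetical and not part of this patch, that a kernel could use to confirm the build-level value still matches the removed per-header formula:

    #include <assert.h>
    /* NUM_CORES, BANKING_FACTOR and NUM_BANKS all arrive as -D flags from
       runtime.mk; this hypothetical check keeps them consistent with the
       removed per-header "#define N_BANKS (NUM_CORES * BANKING_FACTOR)". */
    static_assert(NUM_BANKS == NUM_CORES * BANKING_FACTOR,
                  "NUM_BANKS must equal NUM_CORES * BANKING_FACTOR");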
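The hermitian and linear-solver kernels above all switch their row stride on the same condition: a folded matrix is laid out across the banked memory with stride NUM_BANKS, an unfolded one with its natural row length (n or n_tx). A sketch of that addressing convention, with a helper name invented for illustration:

    #include <stdint.h>
    /* Illustrative only: element (i, j) of a matrix with row length n is
       found at stride n in the linear layout, and at stride NUM_BANKS when
       the matrix is folded across the memory banks. */
    static inline uint32_t folded_index(uint32_t i, uint32_t j, uint32_t n,
                                        uint32_t folded) {
      uint32_t const stride = folded ? NUM_BANKS : n;
      return i * stride + j;
    }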
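The FOLDED store indices in the radix-4 butterflies follow the same idea: the n2 outputs of a stage are cut into four quarters of n2_store = n2 / 4 elements, and the quarter number selects a NUM_BANKS-strided row. As a worked example of the first-stage formula i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS: with n2 = 16 and n2_store = 4, index i0 = 5 gives 5 % 4 = 1 and 5 / 4 = 1, so the output lands at 1 * NUM_BANKS + 1, i.e. column 1 of the second folded row.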
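In the q32 solver hunks, each asm block ends with an addi on %[h] followed by an srai on %[s]; read as fixed-point arithmetic, this adds half an LSB of the target format and then shifts the extra fractional bits away. A scalar C model of that pair, with HALF and FIXS as illustrative stand-ins for the operands bound to %[h] and %[s]:

    #include <stdint.h>
    /* Hypothetical model of "addi %[a0],%[a0],%[h]; srai %[a0],%[a0],%[s]":
       round the 32-bit product by half an LSB, then rescale it with an
       arithmetic right shift. HALF = 1 << (FIXS - 1) under this reading. */
    static inline int32_t fix_round_shift(int32_t prod) {
      return (prod + HALF) >> FIXS;
    }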
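Finally, the ASM define added to mempool_matmul_f32.h documents that the 4x4 f32 kernel relies on asm volatile statements. The helper below is not that kernel, only a hedged sketch of the pattern such a define typically gates:

    /* Invented for illustration: an ASM-gated multiply-accumulate with FP
       values kept in integer registers, matching the "r" constraints used
       by the kernels in this patch. */
    static inline float mac_f32(float c, float a, float b) {
    #ifdef ASM
      asm volatile("fmadd.s %[c], %[a], %[b], %[c];"
                   : [c] "+&r"(c)
                   : [a] "r"(a), [b] "r"(b));
    #else
      c += a * b; /* plain-C fallback */
    #endif
      return c;
    }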