diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index 9511f7869..bb640dfde 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-FP_APPS := axpy_f16 axpy_f32
-FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
-FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
-FP_APPS += dotp_f16 dotp_f32
-FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16
-
-I_APPS := synth_i32
-I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
-I_APPS += cmatmul_q16 mimo_mmse_q16
-
-ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
-ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
+FP_SUFFIXES := f16 f32 f8
+I_SUFFIXES := q16 q32 i16 i32 i8
+FP_APPS := $(foreach suf,$(FP_SUFFIXES),$(filter %_$(suf),$(ALL)))
+I_APPS := $(foreach suf,$(I_SUFFIXES),$(filter %_$(suf),$(ALL)))
+# The GCC list drops the floating-point apps; the LLVM list drops the integer ones
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
index 1795e9059..8bcb38296 100644
--- a/software/apps/baremetal/axpy_f16/main.c
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
index 34ead109c..cb3f1d8a9 100644
--- a/software/apps/baremetal/axpy_f32/main.c
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c
index e23fb929e..36646f877 100644
--- a/software/apps/baremetal/cfft_radix2_q16/main.c
+++ b/software/apps/baremetal/cfft_radix2_q16/main.c
@@ -19,7 +19,7 @@
 #include "synchronization.h"
 
 #include "data_cfft_radix2_q16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /* CFFT mempool libraries */
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index b06ae3189..ced0ffc6a 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -19,25 +19,31 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-#define PARALLEL // Parallel FFT not "memory-aware".
-// #define FOLDED // Parallel FFT with "memory-aware" load/store.
-//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
 
-// Bitreversal index from table.
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED, number of FFTs run sequentially by each core.
+N_FFTs_COL: When SCHEDULED, number of independent FFTs scheduled on columns.
+
+BITREVERSETABLE: When defined, bitreversal indices are fetched from a table;
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: When FOLDED is defined, also fold the twiddles in memory.
+*/
+
+#define PARALLEL
 #define BITREVERSETABLE
-// Also the twiddles have "memory-aware" load/stores.
-// #define FOLDED_TWIDDLES
 
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 1
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 1
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
-#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
 #endif
 
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
@@ -59,16 +65,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
 #endif
 
 #if (defined(SCHEDULED) || defined(FOLDED))
-__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -96,7 +102,7 @@ int main() {
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
@@ -113,9 +119,11 @@ int main() {
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
       *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
           *(v2h *)&l2_twiddleCoef_f16[2 * i];
-      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2 *
+                                      (i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
-      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2 *
+                                      (i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c
index 08ed80e9b..29b84c950 100644
--- a/software/apps/baremetal/cfft_radix4_q16/main.c
+++ b/software/apps/baremetal/cfft_radix4_q16/main.c
@@ -19,23 +19,31 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_q16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-//#define SINGLE // Single core FFT.
-//#define PARALLEL // Parallel FFT not "memory-aware".
-//#define FOLDED // Parallel FFT with "memory-aware" load/store.
-#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED, number of FFTs run sequentially by each core.
+N_FFTs_COL: When SCHEDULED, number of independent FFTs scheduled on columns.
+
+BITREVERSETABLE: When defined, bitreversal indices are fetched from a table;
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: When FOLDED is defined, also fold the twiddles in memory.
+*/
 
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
-#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
 #endif
 // Also the twiddles have "memory-aware" load/stores.
 #define FOLDED_TWIDDLES
@@ -60,16 +68,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
 #endif
 
 #if (defined(SCHEDULED) || defined(FOLDED))
-int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -97,7 +105,7 @@ int main() {
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
@@ -112,9 +120,11 @@ int main() {
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
       *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * i];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 2U)];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 3U)];
     }
   }
diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
index e0feb90c7..304313788 100644
--- a/software/apps/baremetal/chest_f16/main.c
+++ b/software/apps/baremetal/chest_f16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_chest_f16.h"
 #include "data_chest_f16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 //#define SINGLE
 #define PARALLEL
 
diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
index 572b12de0..6f7a73938 100644
--- a/software/apps/baremetal/chest_q16/main.c
+++ b/software/apps/baremetal/chest_q16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_chest_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 #define PARALLEL
 
 int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]
diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
index 6d1c26ff2..10baa6a81 100644
--- a/software/apps/baremetal/cholesky_f16/main.c
+++ b/software/apps/baremetal/cholesky_f16/main.c
@@ -17,6 +17,15 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Cholesky Decomposition.
+PARALLEL: When defined runs parallel Cholesky Decomposition.
+FOLDED: When defined as 1, intermediate results are folded in memory.
+*/
+
 #define SINGLE
 #define FOLDED (0)
 
diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c
index 64fbf3b2f..e0daba421 100644
--- a/software/apps/baremetal/cholesky_q32/main.c
+++ b/software/apps/baremetal/cholesky_q32/main.c
@@ -11,7 +11,7 @@
 #include "synchronization.h"
 
 #define HALF (1023)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 #define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
 #define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT))
 #define ABS(a) (a > 0 ? a : -a)
@@ -31,18 +31,19 @@
 #define N_COL 1
 #define N_ROW 1
 int32_t l1_A[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
 int32_t l1_L[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
+int32_t l1_y[matrix_N] __attribute__((aligned(NUM_BANKS), section(".l1")));
 #else
-int32_t l1_AA[matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LL[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LR[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_AA[matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LL[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LR[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_yy[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -58,11 +59,12 @@ int main() {
       for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
         l1_yy[idx_col * matrix_N + i] = l2_y[i];
         for (uint32_t j = 0; j < matrix_N; j++) {
-          l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j];
+          l1_AA[idx_col * matrix_N + i * NUM_BANKS + j] =
+              l2_A[i * matrix_N + j];
         }
       }
     }
-    for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) {
+    for (uint32_t i = 0; i < N_ROW * matrix_N * NUM_BANKS; i++) {
       l1_LL[i] = 0;
       l1_LR[i] = 0;
     }
diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
index aa2ed55a6..727dba7ca 100644
--- a/software/apps/baremetal/cmatmul_f16/main.c
+++ b/software/apps/baremetal/cmatmul_f16/main.c
@@ -19,7 +19,18 @@
 
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_4x4
+
+/*
+======================
+Parameters and defines
+
+SINGLE_2x2: Single-core matmul on 2x2 tiles.
+PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
+PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
+PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
+PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory to
+avoid banking conflicts.
+*/
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -51,7 +62,7 @@ int main() {
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
 
-#if defined(SINGLE_CORE)
+#if defined(SINGLE_2x2)
   // Execute function to test.
   if (core_id == 0) {
     mempool_start_benchmark();
diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
index 0dcffbfc7..37089fd5b 100644
--- a/software/apps/baremetal/cmatmul_q16/main.c
+++ b/software/apps/baremetal/cmatmul_q16/main.c
@@ -16,6 +16,14 @@
 #include "baremetal/mempool_cmatmul_q16.h"
 #include "data_cmatmul_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
 #define PARALLEL
 #define dim_M (matrix_M)
 #define dim_N (matrix_N)
diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
index 2091f0336..3b8b272b9 100644
--- a/software/apps/baremetal/dotp_f16/main.c
+++ b/software/apps/baremetal/dotp_f16/main.c
@@ -14,9 +14,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,18 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  //  // SINGLE-CORE
-  //  time_init = mempool_get_timer();
-  //  dotp_f16s(l1_X, l1_Y, sum, array_N);
-  //  // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N);
-  //  time_end = mempool_get_timer();
-
-  //  // PARALLEL
-  //  time_init = mempool_get_timer();
-  //  dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores);
-  //  // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores);
-  //  time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
index 3507795b1..e1a87b6b8 100644
--- a/software/apps/baremetal/dotp_f32/main.c
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -15,9 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  //    // SINGLE-CORE
-  //    time_init = mempool_get_timer();
-  //    dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N);
-  //    time_end = mempool_get_timer();
-
-  //   // PARALLEL
-  //   time_init = mempool_get_timer();
-  //   dotp_f32p(l1_A, l1_B, sum, array_N, num_cores);
-  //   time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index ee2e2ea52..8f6490ee2 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -15,11 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_i32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-#define LOG_BARRIERS
-// #define ATOMIC_REDUCTION
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
@@ -49,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  //  // SINGLE-CORE
-  //  time_init = mempool_get_timer();
-  //  dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N);
-  //  time_end = mempool_get_timer();
-
-  //  // PARALLEL
-  //  time_init = mempool_get_timer();
-  //  dotp_i32p(l1_A, l1_B, sum, array_N, num_cores);
-  //  time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c
index 99a0269cc..9964257ca 100644
--- a/software/apps/baremetal/matmul_f16/main.c
+++ b/software/apps/baremetal/matmul_f16/main.c
@@ -17,7 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f16.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
 
 __fp16 matrix_a[matrix_M * matrix_N]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c
index d3d7622db..ba9165ed1 100644
--- a/software/apps/baremetal/matmul_f32/main.c
+++ b/software/apps/baremetal/matmul_f32/main.c
@@ -17,8 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f32.h"
 
-#define PARALLEL
-#define ASM
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
 
 float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index 80309a1e0..b1ef24451 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -18,25 +18,45 @@
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 
 #include "data_mimo_mmse_f16.h"
-#define ZF (0)   // When asserted use zero-forcing
-#define FOLD (1) // When asserted fold matrices in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
+
+/*
+======================
+Parameters and defines
+
+DOUBLE_BUFFERING: When defined benchmark double buffered MIMO-MMSE, including
+L2-L1 transfers.
+
+For MIMO-MMSE without L2-L1 transfers:
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When defined as 1, use zero-forcing detector.
+FOLD: When defined as 1, fold matrices in memory.
+
+For MIMO-MMSE with L2-L1 transfers:
+DMA_TRANSFER1: If set, transfer next-round inputs at beginning of computation.
+DMA_TRANSFER2: If set, transfer next-round inputs after Hermitian computation.
+N_ROUNDS: Number of rounds of double-buffering.
+*/
+
+#define ZF (0)
+#define FOLD (1)
 #define PARALLEL
 #define VEC
 
+#ifndef DOUBLE_BUFFERING
+
 /**********************************************************
  **********************************************************
-  _   _  ___        _     _ _____                     __
- | \ | |/ _ \      | |   / |_   _| __ __ _ _ __  ___ / _|
- |  \| | | | |_____| |   | | | || '__/ _` | '_ \/ __| |_
- | |\  | |_| |_____| |___| | | || | | (_| | | | \__ \  _|
- |_| \_|\___/      |_____|_| |_||_|  \__,_|_| |_|___/_|(_)
+  _   _  ___        _____                     __
+ | \ | |/ _ \      |_   _| __ __ _ _ __  ___ / _|
+ |  \| | | | |_____  | || '__/ _` | '_ \/ __| |_
+ | |\  | |_| |_____  | || | | (_| | | | \__ \  _|
+ |_| \_|\___/        |_||_|  \__,_|_| |_|___/_|(_)
 
 ***********************************************************
 ***********************************************************/
 
-#ifndef DOUBLE_BUFFERING
-
 #if FOLD
 #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
 #define NUM_COL (NUM_BANKS / N_TX)
@@ -193,6 +213,8 @@ int main() {
   return 0;
 }
 
+#else
+
 /**********************************************************
  **********************************************************
   ____  __  __    _       _____                     __
@@ -204,10 +226,6 @@ int main() {
 ***********************************************************
 ***********************************************************/
 
-#else
-#define N_ROUNDS (1)
-#define DMA_TRANSFER1
-
 // Inputs-Outputs even double-buffering rounds
 __fp16 l1A_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c
index d243754fc..fb054e4e0 100644
--- a/software/apps/baremetal/mimo_mmse_f32/main.c
+++ b/software/apps/baremetal/mimo_mmse_f32/main.c
@@ -21,6 +21,17 @@
 
 #include "data_mimo_mmse_f32.h"
 
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+PARALLEL_HERMITIAN: Hermitian is fine-grain parallelized over a group of cores.
+ZF: When defined as 1, use zero-forcing detector.
+FOLD: When defined as 1, fold matrices in memory.
+*/
+
 #define SINGLE
 #define ZF (0)
 #define FOLD (0)
diff --git a/software/apps/baremetal/mimo_mmse_f8/main.c b/software/apps/baremetal/mimo_mmse_f8/main.c
index c5d1cd77e..006dbf83b 100644
--- a/software/apps/baremetal/mimo_mmse_f8/main.c
+++ b/software/apps/baremetal/mimo_mmse_f8/main.c
@@ -18,9 +18,20 @@
 #include "baremetal/mempool_mimo_mmse_f8s.h"
 
 #include "data_mimo_mmse_f8.h"
+
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When defined as 1, use zero-forcing detector.
+FOLD: When defined as 1, fold matrices in memory.
+*/
+
 #define ZF (0)   // When asserted use zero-forcing
 #define FOLD (0) // When asserted fold matrixes in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
 #define PARALLEL
 #define VEC
 
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
index 9bcb5e9db..8e2b557a4 100644
--- a/software/apps/baremetal/mimo_mmse_q16/main.c
+++ b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -16,7 +16,13 @@
 #include "baremetal/mempool_linearsolver_q16s.h"
 #include "baremetal/mempool_mimo_mmse_q16s.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+*/
 
 int16_t l1_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
diff --git a/software/apps/baremetal/ofdm_f16/main.c b/software/apps/baremetal/ofdm_f16/main.c
index 264768199..3cf04dbed 100644
--- a/software/apps/baremetal/ofdm_f16/main.c
+++ b/software/apps/baremetal/ofdm_f16/main.c
@@ -18,7 +18,6 @@
 #include "synchronization.h"
 
 #include "data_ofdm_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // CFFT Parameters
 #define SCHEDULED
@@ -28,7 +27,7 @@
 #define N_FFTs_COL 4
 #define N_FFTs_ROW (N_RX / N_FFTs_COL)
 // CMATMUL Parameters
-#define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX))
+#define NUM_COPIES (NUM_BANKS / (N_BEAMS * N_RX))
 #define dim_M (N_BEAMS)
 #define dim_N (N_RX)
 #define dim_P (N_SC)
@@ -43,18 +42,18 @@ dump(checkpoint, 1);
 
 uint32_t arrival_index __attribute__((section(".l1_prio")));
 __fp16 l1_pBF_Coef_folded[2 * BANKING_FACTOR * NUM_CORES]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 
-__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[6 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[6 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[6 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[6 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /* MAIN */
@@ -67,7 +66,7 @@ int main() {
   mempool_start_benchmark();
   if (core_id == 0) {
     // Each FFT is folded over 4 memory rows
-    // Each memory row is 2 * N_BANKS samples
+    // Each memory row is 2 * NUM_BANKS samples
     __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED);
     dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src,
                         (N_RX * N_SC) * sizeof(int32_t));
@@ -78,7 +77,7 @@ int main() {
                           dim_M * dim_N * sizeof(int32_t));
     }
     for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
+      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS),
                           l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t));
     }
   }
@@ -114,7 +113,7 @@ int main() {
     dma_memcpy_blocking(l2_pBF_Dst, l1_pFFT_Dst,
                         (N_RX * N_SC) * sizeof(int32_t));
     for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
+      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS),
                           l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t));
     }
     __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED);
diff --git a/software/kernels/baremetal/mempool_chest_q16.h b/software/kernels/baremetal/mempool_chest_q16.h
index b4d90adff..6e735bbe7 100644
--- a/software/kernels/baremetal/mempool_chest_q16.h
+++ b/software/kernels/baremetal/mempool_chest_q16.h
@@ -6,7 +6,7 @@
 
 #pragma once
 #include "builtins_v2.h"
-#define __MUL
+#define __MUL // Multiplication by pilot instead of division.
 
 /* a[i] = ar[i] + i * ai[j]
    out[i][j] = a[i] / c[j]
diff --git a/software/kernels/baremetal/mempool_cmatmul_f16.h b/software/kernels/baremetal/mempool_cmatmul_f16.h
index 12645c454..f37c197d8 100644
--- a/software/kernels/baremetal/mempool_cmatmul_f16.h
+++ b/software/kernels/baremetal/mempool_cmatmul_f16.h
@@ -13,10 +13,8 @@
 
 #pragma once
 #include "builtins_v2.h"
-// Use complex dotp in a single offload
-#define __CDOTP
-// Shift cores startpoint over rows of matrix A
-#define __SHIFT_A
+#define __CDOTP   // Use complex dotp in a single offload
+#define __SHIFT_A // Shift cores startpoint over rows of matrix A
 
 /******************************************************************************
  __        ___     _            _                   ____        _
diff --git a/software/kernels/baremetal/mempool_cmatmul_q16.h b/software/kernels/baremetal/mempool_cmatmul_q16.h
index aa6a71b6c..78ed04b31 100644
--- a/software/kernels/baremetal/mempool_cmatmul_q16.h
+++ b/software/kernels/baremetal/mempool_cmatmul_q16.h
@@ -13,8 +13,7 @@
 
 #pragma once
 #include "builtins_v2.h"
-// Shift cores startpoint over rows of matrix A
-#define __SHIFT_A
+#define __SHIFT_A // Shift cores startpoint over rows of matrix A
 
 #define CMATMUL_1x1_LOOP                                                       \
   v2s sum = {0, 0};                                                            \
diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h
index 791b7c68e..e8083cfcf 100644
--- a/software/kernels/baremetal/mempool_dotp_f16.h
+++ b/software/kernels/baremetal/mempool_dotp_f16.h
@@ -7,6 +7,16 @@
 #pragma once
 #include "builtins_v2.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE_CORE_REDUCTION: Reduction performed by a single core.
+BINARY_REDUCTION: Reduction with binary tree.
+*/
+
+#define SINGLE_CORE_REDUCTION
+
 #define DOTPF16VEC_UNROLLED4_LOOP                                              \
   {                                                                            \
     a01 = (*(v2h *)&in_a[i]);                                                  \
diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h
index 290b96d59..13e4ff9e5 100644
--- a/software/kernels/baremetal/mempool_dotp_f32.h
+++ b/software/kernels/baremetal/mempool_dotp_f32.h
@@ -4,6 +4,16 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+/*
+======================
+Parameters and defines
+
+SINGLE_CORE_REDUCTION: Reduction performed by a single core.
+BINARY_REDUCTION: Reduction with binary tree.
+*/
+
+#define SINGLE_CORE_REDUCTION
+
 #define DOTPF32_UNROLLED4_LOOP                                                 \
   {                                                                            \
     a0 = in_a[i];                                                              \
diff --git a/software/kernels/baremetal/mempool_dotp_i32.h b/software/kernels/baremetal/mempool_dotp_i32.h
index 4b80e92ed..3f8320b91 100644
--- a/software/kernels/baremetal/mempool_dotp_i32.h
+++ b/software/kernels/baremetal/mempool_dotp_i32.h
@@ -4,6 +4,18 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+/*
+======================
+Parameters and defines
+
+SINGLE_CORE_REDUCTION: Reduction performed by a single core.
+BINARY_REDUCTION: Reduction with binary tree.
+ATOMIC_REDUCTION: Reduction with atomics.
+LOG_BARRIERS: Presumably selects logarithmic barriers (TODO: confirm usage).
+*/
+
+#define SINGLE_CORE_REDUCTION
+
 #define DOTPI32_UNROLLED4_LOOP                                                 \
   {                                                                            \
     a0 = in_a[i];                                                              \
diff --git a/software/kernels/baremetal/mempool_matmul_f32.h b/software/kernels/baremetal/mempool_matmul_f32.h
index 8879fa52d..c6b669bcc 100644
--- a/software/kernels/baremetal/mempool_matmul_f32.h
+++ b/software/kernels/baremetal/mempool_matmul_f32.h
@@ -13,6 +13,8 @@
 
 #pragma once
 #include "builtins_v2.h"
+// The 4x4 matmul is executed with an asm volatile statement
+#define ASM
 
 void matmul_2x2_single_f32(float const *__restrict__ A,
                            float const *__restrict__ B, float *__restrict__ C,
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
index 3ce36f3b6..12b2320e5 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
@@ -48,7 +48,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
 // STORE INDEXES
 #if defined(FOLDED) || defined(SCHEDULED)
   uint32_t n2_store = n2 >> 2U;
-  i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS;
+  i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS;
   i1_store = i0_store + n2_store;
   i2_store = i1_store + n2_store;
   i3_store = i2_store + n2_store;
@@ -158,9 +158,9 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 +
    * 3fftLen/4] */
-  i1 = i0 + N_BANKS;
-  i2 = i1 + N_BANKS;
-  i3 = i2 + N_BANKS;
+  i1 = i0 + NUM_BANKS;
+  i2 = i1 + NUM_BANKS;
+  i3 = i2 + NUM_BANKS;
 #else
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 +
@@ -173,7 +173,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
 #if defined(FOLDED) || defined(SCHEDULED)
   uint32_t n2_store = n2 >> 2U;
   i0_store =
-      (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS;
+      (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * NUM_BANKS;
   i1_store = i0_store + n2_store;
   i2_store = i1_store + n2_store;
   i3_store = i2_store + n2_store;
@@ -278,9 +278,9 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4],
       pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
-  i1 = i0 + N_BANKS;
-  i2 = i1 + N_BANKS;
-  i3 = i2 + N_BANKS;
+  i1 = i0 + NUM_BANKS;
+  i2 = i1 + NUM_BANKS;
+  i3 = i2 + NUM_BANKS;
 #else
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4],
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h
index a0c1a7791..5db6be1cf 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h
@@ -50,7 +50,7 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut,
 // STORE INDEXES
 #if defined(FOLDED) || defined(SCHEDULED)
   uint32_t n2_store = n2 >> 2U;
-  i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS;
+  i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS;
   i1_store = i0_store + n2_store;
   i2_store = i1_store + n2_store;
   i3_store = i2_store + n2_store;
@@ -227,9 +227,9 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut,
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 +
    * 3fftLen/4] */
-  i1 = i0 + N_BANKS;
-  i2 = i1 + N_BANKS;
-  i3 = i2 + N_BANKS;
+  i1 = i0 + NUM_BANKS;
+  i2 = i1 + NUM_BANKS;
+  i3 = i2 + NUM_BANKS;
 #else
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 +
@@ -242,7 +242,7 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut,
 #if defined(FOLDED) || defined(SCHEDULED)
   uint32_t n2_store = n2 >> 2U;
   i0_store =
-      (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS;
+      (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * NUM_BANKS;
   i1_store = i0_store + n2_store;
   i2_store = i1_store + n2_store;
   i3_store = i2_store + n2_store;
@@ -403,9 +403,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut,
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4],
       pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
-  i1 = i0 + N_BANKS;
-  i2 = i1 + N_BANKS;
-  i3 = i2 + N_BANKS;
+  i1 = i0 + NUM_BANKS;
+  i2 = i1 + NUM_BANKS;
+  i3 = i2 + NUM_BANKS;
 #else
   /*  index calculation for the input as, */
   /*  pIn[i0 + 0], pIn[i0 + fftLen/4],
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
index c6b4acf6b..90a3fd093 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
@@ -5,7 +5,6 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 #pragma once
-#define BITREVERSETABLE
 #include "builtins_v2.h"
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
@@ -33,19 +32,19 @@
 #ifdef FOLDED_TWIDDLES
 #define LOAD_STORE_TWIDDLEFACT                                                 \
   CoSi1 = *(v2h *)&pCoef_src[2U * ic];                                         \
-  CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)];                         \
-  CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * N_BANKS)];                         \
+  CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)];                       \
+  CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)];                       \
   if (ic % 4 == 0) {                                                           \
     *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi1;                             \
     *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1;              \
     *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1;              \
     *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1;              \
-    ic_store += N_BANKS;                                                       \
+    ic_store += NUM_BANKS;                                                     \
     *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi2;                             \
     *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2;              \
     *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2;              \
     *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2;              \
-    ic_store += N_BANKS;                                                       \
+    ic_store += NUM_BANKS;                                                     \
     *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi3;                             \
     *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3;              \
     *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3;              \
@@ -315,8 +314,8 @@ void mempool_radix4_cfft_f16p_scheduler(
     LOAD_STORE_TWIDDLEFACT;
     SHUFFLE_TWIDDLEFACT;
     for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-      pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
-      pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen;
+      pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
       radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
                              C3);
     }
@@ -345,8 +344,8 @@ void mempool_radix4_cfft_f16p_scheduler(
       SHUFFLE_TWIDDLEFACT;
 
       for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
-        pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
         radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
                                 C3);
       }
@@ -370,8 +369,8 @@ void mempool_radix4_cfft_f16p_scheduler(
       uint32_t col_shift = fftLen / 4;
 #endif
 
-      pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
-      pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift;
+      pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift;
       radix4_butterfly_last(pIn, pOut, i0);
     }
   }
@@ -416,7 +415,7 @@ void mempool_radix4_cfft_f16p_scheduler(
                    : [s2] "r"(s2)
                    :);
       for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-        uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8));
+        uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8));
         // Load at address a
         tmpa1 = *(uint32_t *)&ptr[a1];
         tmpa2 = *(uint32_t *)&ptr[a2];
@@ -465,12 +464,12 @@ void mempool_radix4_cfft_f16p_scheduler(
         idx3 = idx3 >> 1U;
       }
       idx0 = ic / 4;
-      idx1 = ic / 4 + N_BANKS;
-      idx2 = ic / 4 + 2 * N_BANKS;
-      idx3 = ic / 4 + 3 * N_BANKS;
+      idx1 = ic / 4 + NUM_BANKS;
+      idx2 = ic / 4 + 2 * NUM_BANKS;
+      idx3 = ic / 4 + 3 * NUM_BANKS;
       for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8);
-        ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8);
+        ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8);
+        ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8);
         *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]);
         *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]);
         *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]);
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h
index e91602866..ef6122a00 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h
@@ -45,19 +45,19 @@
 
 #define LOAD_STORE_TWIDDLEFACT                                                 \
   CoSi1 = *(v2s *)&pCoef_src[2U * ic];                                         \
-  CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * N_BANKS)];                         \
-  CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * N_BANKS)];                         \
+  CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)];                       \
+  CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)];                       \
   if (ic % 4 == 0) {                                                           \
     *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi1;                             \
     *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1;              \
     *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1;              \
     *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1;              \
-    ic_store += N_BANKS;                                                       \
+    ic_store += NUM_BANKS;                                                     \
     *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi2;                             \
     *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2;              \
     *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2;              \
     *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2;              \
-    ic_store += N_BANKS;                                                       \
+    ic_store += NUM_BANKS;                                                     \
     *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi3;                             \
     *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3;              \
     *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3;              \
@@ -226,16 +226,16 @@ static inline void fold_radix4(int16_t *pSrc16, uint32_t fftLen,
     i1 = i0 + n2;
     i2 = i1 + n2;
     i3 = i2 + n2;
-    i1_store = i0 + N_BANKS;
-    i2_store = i1_store + N_BANKS;
-    i3_store = i2_store + N_BANKS;
+    i1_store = i0 + NUM_BANKS;
+    i2_store = i1_store + NUM_BANKS;
+    i3_store = i2_store + NUM_BANKS;
     for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-      A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * N_BANKS)];
-      B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * N_BANKS)];
-      C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * N_BANKS)];
-      *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * N_BANKS)] = A;
-      *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * N_BANKS)] = B;
-      *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * N_BANKS)] = C;
+      A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * NUM_BANKS)];
+      B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * NUM_BANKS)];
+      C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * NUM_BANKS)];
+      *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * NUM_BANKS)] = A;
+      *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * NUM_BANKS)] = B;
+      *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * NUM_BANKS)] = C;
     }
   }
   mempool_log_partial_barrier(2, absolute_core_id, nPE);
@@ -426,8 +426,8 @@ void mempool_radix4_cfft_q16p_scheduler(
     LOAD_STORE_TWIDDLEFACT;
     SHUFFLE_TWIDDLEFACT;
     for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-      pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
-      pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen;
+      pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
       radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
                              C3);
     }
@@ -460,8 +460,8 @@ void mempool_radix4_cfft_q16p_scheduler(
       SHUFFLE_TWIDDLEFACT;
 
       for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
-        pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
         radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
                                 C3);
       }
@@ -489,8 +489,8 @@ void mempool_radix4_cfft_q16p_scheduler(
       uint32_t col_shift = fftLen / 4;
 #endif
 
-      pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
-      pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift;
+      pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift;
       radix4_butterfly_last(pIn, pOut, i0);
     }
   }
@@ -535,7 +535,7 @@ void mempool_radix4_cfft_q16p_scheduler(
                    : [s2] "r"(s2)
                    :);
       for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-        uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8));
+        uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8));
         // Load at address a
         tmpa1 = *(uint32_t *)&ptr[a1];
         tmpa2 = *(uint32_t *)&ptr[a2];
@@ -584,12 +584,12 @@ void mempool_radix4_cfft_q16p_scheduler(
         idx3 = idx3 >> 1U;
       }
       idx0 = ic / 4;
-      idx1 = ic / 4 + N_BANKS;
-      idx2 = ic / 4 + 2 * N_BANKS;
-      idx3 = ic / 4 + 3 * N_BANKS;
+      idx1 = ic / 4 + NUM_BANKS;
+      idx2 = ic / 4 + 2 * NUM_BANKS;
+      idx3 = ic / 4 + 3 * NUM_BANKS;
       for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8);
-        ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8);
+        ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8);
+        ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8);
         *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]);
         *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]);
         *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]);
diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk
index 2602804cc..94f822ddc 100644
--- a/software/runtime/runtime.mk
+++ b/software/runtime/runtime.mk
@@ -27,7 +27,7 @@ DATA_DIR           ?= $(abspath $(ROOT_DIR)/../data)
 COMPILER      ?= gcc
 XPULPIMG      ?= $(xpulpimg)
 ZFINX         ?= $(zfinx)
-XDIVSQRT	  ?= $(xDivSqrt)
+XDIVSQRT	    ?= $(xDivSqrt)
 
 RISCV_XLEN    ?= 32
 
@@ -92,6 +92,7 @@ DEFINES += -DNUM_CORES=$(num_cores)
 DEFINES += -DNUM_GROUPS=$(num_groups)
 DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile)
 DEFINES += -DBANKING_FACTOR=$(banking_factor)
+DEFINES += -DNUM_BANKS=$(shell awk 'BEGIN{print $(banking_factor)*$(num_cores)}')
 DEFINES += -DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}')
 DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}')
 DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')