[software] Add explanation for the use of defines

pulp-platform · Dec 19, 2024 · dea872f · dea872f
1 parent 0f1de6f
commit dea872f
Show file tree

Hide file tree

Showing 45 changed files with 452 additions and 363 deletions.
diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
@@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-FP_APPS := axpy_f16 axpy_f32
-FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
-FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
-FP_APPS += dotp_f16 dotp_f32
-FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16
-
-I_APPS := synth_i32
-I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
-I_APPS += cmatmul_q16 mimo_mmse_q16
-
-ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
-ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
+FP_SUFFIXES := f16 f32 f8
+I_SUFFIXES := q16 q32 i16 i32 i8
+FP_APPS := $(foreach suf, $(FP_SUFFIXES), $(filter %_$(suf), $(ALL)))
+I_APPS := $(foreach suf, $(I_SUFFIXES), $(filter %_$(suf), $(ALL)))
+# ALL_GCC drops the floating-point apps; ALL_LLVM drops the integer apps
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)

diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c
@@ -19,7 +19,6 @@
 #include "synchronization.h"
 
 #include "data_cfft_radix2_q16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /* CFFT mempool libraries */
 #include "baremetal/mempool_cfft_q16_bitreversal.h"

diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -19,25 +19,30 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-#define PARALLEL // Parallel FFT not "memory-aware".
-// #define FOLDED // Parallel FFT with "memory-aware" load/store.
-//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
 
-// Bitreversal index from table.
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED, number of FFTs run sequentially by each core.
+N_FFTs_COL: When SCHEDULED, number of independent FFTs scheduled on columns.
+
+BITREVERSETABLE: When defined, bitreversal indices are fetched from a table;
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: With FOLDED defined, also folds the twiddle factors in memory.
+*/
+
+#define PARALLEL
 #define BITREVERSETABLE
-// Also the twiddles have "memory-aware" load/stores.
-// #define FOLDED_TWIDDLES
 
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 1
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 1
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
-#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
 #endif
 
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
@@ -59,16 +64,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
 #endif
 
 #if (defined(SCHEDULED) || defined(FOLDED))
-__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -96,7 +101,7 @@ int main() {
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
@@ -113,9 +118,11 @@ int main() {
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
       *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
           *(v2h *)&l2_twiddleCoef_f16[2 * i];
-      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2 *
+                                      (i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
-      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2 *
+                                      (i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }

diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c
@@ -19,23 +19,30 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_q16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-//#define SINGLE // Single core FFT.
-//#define PARALLEL // Parallel FFT not "memory-aware".
-//#define FOLDED // Parallel FFT with "memory-aware" load/store.
-#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED, number of FFTs run sequentially by each core.
+N_FFTs_COL: When SCHEDULED, number of independent FFTs scheduled on columns.
+
+BITREVERSETABLE: When defined, bitreversal indices are fetched from a table;
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: With FOLDED defined, also folds the twiddle factors in memory.
+*/
 
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
-#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
 #endif
 // Also the twiddles have "memory-aware" load/stores.
 #define FOLDED_TWIDDLES
@@ -60,16 +67,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
 #endif
 
 #if (defined(SCHEDULED) || defined(FOLDED))
-int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -97,7 +104,7 @@ int main() {
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
@@ -112,9 +119,11 @@ int main() {
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
       *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * i];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 2U)];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 3U)];
     }
   }

diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_chest_f16.h"
 #include "data_chest_f16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 //#define SINGLE
 #define PARALLEL
 

diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_chest_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 #define PARALLEL
 
 int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]

diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
@@ -17,6 +17,15 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Cholesky Decomposition.
+PARALLEL: When defined runs parallel Cholesky Decomposition.
+FOLDED: When defined as 1, intermediate results are folded in memory.
+*/
+
 #define SINGLE
 #define FOLDED (0)
 

diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c
@@ -11,7 +11,6 @@
 #include "synchronization.h"
 
 #define HALF (1023)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
 #define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT))
 #define ABS(a) (a > 0 ? a : -a)
@@ -31,18 +30,19 @@
 #define N_COL 1
 #define N_ROW 1
 int32_t l1_A[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
 int32_t l1_L[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
+int32_t l1_y[matrix_N] __attribute__((aligned(NUM_BANKS), section(".l1")));
 #else
-int32_t l1_AA[matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LL[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LR[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_AA[matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LL[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LR[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_yy[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -58,11 +58,12 @@ int main() {
       for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
         l1_yy[idx_col * matrix_N + i] = l2_y[i];
         for (uint32_t j = 0; j < matrix_N; j++) {
-          l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j];
+          l1_AA[idx_col * matrix_N + i * NUM_BANKS + j] =
+              l2_A[i * matrix_N + j];
         }
       }
     }
-    for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) {
+    for (uint32_t i = 0; i < N_ROW * matrix_N * NUM_BANKS; i++) {
       l1_LL[i] = 0;
       l1_LR[i] = 0;
     }

diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
@@ -19,7 +19,18 @@
 
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_4x4
+
+/*
+======================
+Parameters and defines
+
+SINGLE_2x2: Single-core matmul on 2x2 tiles.
+PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
+PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
+PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
+PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory to
+avoid banking conflicts.
+*/
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -51,7 +62,7 @@ int main() {
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
 
-#if defined(SINGLE_CORE)
+#if defined(SINGLE_2x2)
   // Execute function to test.
   if (core_id == 0) {
     mempool_start_benchmark();

diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
@@ -16,6 +16,14 @@
 #include "baremetal/mempool_cmatmul_q16.h"
 #include "data_cmatmul_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
 #define PARALLEL
 #define dim_M (matrix_M)
 #define dim_N (matrix_N)