[software] Add explanation for the use of defines

pulp-platform · Dec 19, 2024 · ef75e96 · ef75e96
1 parent 0f1de6f
commit ef75e96
Show file tree

Hide file tree

Showing 31 changed files with 237 additions and 142 deletions.
diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
@@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-FP_APPS := axpy_f16 axpy_f32
-FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
-FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
-FP_APPS += dotp_f16 dotp_f32
-FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16
-
-I_APPS := synth_i32
-I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
-I_APPS += cmatmul_q16 mimo_mmse_q16
-
-ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
-ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
+FP_SUFFIXES := f16 f32 f8
+I_SUFFIXES := q16 q32 i16 i32 i8
+FP_APPS := $(foreach suf,$(FP_SUFFIXES),$(filter %_$(suf),$(ALL)))
+I_APPS := $(foreach suf,$(I_SUFFIXES),$(filter %_$(suf),$(ALL)))
+# Drop floating-point apps from ALL_GCC and integer apps from ALL_LLVM
+ALL_GCC := $(filter-out $(FP_APPS),$(ALL))
+ALL_LLVM := $(filter-out $(I_APPS),$(ALL))
 
 # Make all applications
 all: $(ALL_GCC)

diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -22,20 +22,26 @@
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-#define PARALLEL // Parallel FFT not "memory-aware".
-// #define FOLDED // Parallel FFT with "memory-aware" load/store.
-//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
-
-// Bitreversal index from table.
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED, number of FFTs run sequentially by each core.
+N_FFTs_COL: When SCHEDULED, number of independent FFTs scheduled on columns.
+
+BITREVERSETABLE: When defined bitreversal indices are fetched from a table, else
+they are computed by cores.
+FOLDED_TWIDDLES: With FOLDED, define it to also fold the twiddles in memory.
+*/
+
+#define PARALLEL
 #define BITREVERSETABLE
-// Also the twiddles have "memory-aware" load/stores.
-// #define FOLDED_TWIDDLES
 
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 1
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 1
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
 #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
 #endif

diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c
@@ -22,18 +22,26 @@
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-//#define SINGLE // Single core FFT.
-//#define PARALLEL // Parallel FFT not "memory-aware".
-//#define FOLDED // Parallel FFT with "memory-aware" load/store.
-#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED, number of FFTs run sequentially by each core.
+N_FFTs_COL: When SCHEDULED, number of independent FFTs scheduled on columns.
+
+BITREVERSETABLE: When defined bitreversal indices are fetched from a table, else
+they are computed by cores.
+FOLDED_TWIDDLES: With FOLDED, define it to also fold the twiddles in memory.
+*/
 
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
 #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
 #endif

diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_chest_f16.h"
 #include "data_chest_f16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 //#define SINGLE
 #define PARALLEL
 

diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_chest_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 #define PARALLEL
 
 int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]

diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
@@ -17,6 +17,15 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Cholesky Decomposition.
+PARALLEL: When defined runs parallel Cholesky Decomposition.
+FOLDED: When defined as 1, intermediate results are folded in memory.
+*/
+
 #define SINGLE
 #define FOLDED (0)
 

diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
@@ -19,7 +19,18 @@
 
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_4x4
+
+/*
+======================
+Parameters and defines
+
+SINGLE_2x2: Single-core matmul on 2x2 tiles.
+PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
+PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
+PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
+PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory to
+avoid banking conflicts.
+*/
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -51,7 +62,7 @@ int main() {
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
 
-#if defined(SINGLE_CORE)
+#if defined(SINGLE_2x2)
   // Execute function to test.
   if (core_id == 0) {
     mempool_start_benchmark();

diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
@@ -16,6 +16,14 @@
 #include "baremetal/mempool_cmatmul_q16.h"
 #include "data_cmatmul_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
 #define PARALLEL
 #define dim_M (matrix_M)
 #define dim_N (matrix_N)

diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
@@ -14,9 +14,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,18 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  //  // SINGLE-CORE
-  //  time_init = mempool_get_timer();
-  //  dotp_f16s(l1_X, l1_Y, sum, array_N);
-  //  // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N);
-  //  time_end = mempool_get_timer();
-
-  //  // PARALLEL
-  //  time_init = mempool_get_timer();
-  //  dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores);
-  //  // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores);
-  //  time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N);

diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
@@ -15,9 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  //    // SINGLE-CORE
-  //    time_init = mempool_get_timer();
-  //    dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N);
-  //    time_end = mempool_get_timer();
-
-  //   // PARALLEL
-  //   time_init = mempool_get_timer();
-  //   dotp_f32p(l1_A, l1_B, sum, array_N, num_cores);
-  //   time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N);

diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
@@ -15,11 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_i32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-#define LOG_BARRIERS
-// #define ATOMIC_REDUCTION
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
@@ -49,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  //  // SINGLE-CORE
-  //  time_init = mempool_get_timer();
-  //  dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N);
-  //  time_end = mempool_get_timer();
-
-  //  // PARALLEL
-  //  time_init = mempool_get_timer();
-  //  dotp_i32p(l1_A, l1_B, sum, array_N, num_cores);
-  //  time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N);

diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c
@@ -17,7 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f16.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
 
 __fp16 matrix_a[matrix_M * matrix_N]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));

diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c
@@ -17,8 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f32.h"
 
-#define PARALLEL
-#define ASM
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
 
 float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));

diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -18,25 +18,45 @@
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 
 #include "data_mimo_mmse_f16.h"
-#define ZF (0)   // When asserted use zero-forcing
-#define FOLD (1) // When asserted fold matrices in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
+
+/*
+======================
+Parameters and defines
+
+DOUBLE_BUFFERING: When defined benchmark double buffered MIMO-MMSE, including
+L2-L1 transfers.
+
+For MIMO-MMSE without L2-L1 transfers:
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When defined as 1, use the zero-forcing detector.
+FOLD: When defined as 1, fold matrices in memory.
+
+For MIMO-MMSE with L2-L1 transfers:
+DMA_TRANSFER1: If defined, transfer next-round inputs when computation begins.
+DMA_TRANSFER2: If defined, transfer next-round inputs after the Hermitian step.
+N_ROUNDS: Defines the number of rounds of double-buffering.
+*/
+
+#define ZF (0)
+#define FOLD (1)
 #define PARALLEL
 #define VEC
 
+#ifndef DOUBLE_BUFFERING
+
 /**********************************************************
  **********************************************************
-  _   _  ___        _     _ _____                     __
- | \ | |/ _ \      | |   / |_   _| __ __ _ _ __  ___ / _|
- |  \| | | | |_____| |   | | | || '__/ _` | '_ \/ __| |_
- | |\  | |_| |_____| |___| | | || | | (_| | | | \__ \  _|
- |_| \_|\___/      |_____|_| |_||_|  \__,_|_| |_|___/_|(_)
+  _   _  ___        _____                     __
+ | \ | |/ _ \      |_   _| __ __ _ _ __  ___ / _|
+ |  \| | | | |_____  | || '__/ _` | '_ \/ __| |_
+ | |\  | |_| |_____  | || | | (_| | | | \__ \  _|
+ |_| \_|\___/        |_||_|  \__,_|_| |_|___/_|(_)
 
 ***********************************************************
 ***********************************************************/
 
-#ifndef DOUBLE_BUFFERING
-
 #if FOLD
 #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
 #define NUM_COL (NUM_BANKS / N_TX)
@@ -193,6 +213,8 @@ int main() {
   return 0;
 }
 
+#else
+
 /**********************************************************
  **********************************************************
   ____  __  __    _       _____                     __
@@ -204,10 +226,6 @@ int main() {
 ***********************************************************
 ***********************************************************/
 
-#else
-#define N_ROUNDS (1)
-#define DMA_TRANSFER1
-
 // Inputs-Outputs even double-buffering rounds
 __fp16 l1A_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));