From dea872f6fb96294d2d402046a88a6dbb38c97b56 Mon Sep 17 00:00:00 2001
From: mbertuletti
Date: Thu, 19 Dec 2024 11:52:17 +0100
Subject: [PATCH] [software] Add explanation for the use of defines

---
 software/apps/baremetal/Makefile              |  19 +--
 software/apps/baremetal/axpy_f16/main.c       |   1 -
 software/apps/baremetal/axpy_f32/main.c       |   1 -
 .../apps/baremetal/cfft_radix2_q16/main.c     |   1 -
 .../apps/baremetal/cfft_radix4_f16/main.c     |  59 ++++----
 .../apps/baremetal/cfft_radix4_q16/main.c     |  59 ++++----
 software/apps/baremetal/chest_f16/main.c      |   8 ++
 software/apps/baremetal/chest_q16/main.c      |   8 ++
 software/apps/baremetal/cholesky_f16/main.c   |   9 ++
 software/apps/baremetal/cholesky_q32/main.c   |  27 ++--
 software/apps/baremetal/cmatmul_f16/main.c    |  15 +-
 software/apps/baremetal/cmatmul_q16/main.c    |   8 ++
 software/apps/baremetal/dotp_f16/main.c       |  15 --
 software/apps/baremetal/dotp_f32/main.c       |  13 --
 software/apps/baremetal/dotp_i32/main.c       |  15 --
 software/apps/baremetal/matmul_f16/main.c     |   8 +-
 software/apps/baremetal/matmul_f32/main.c     |   9 +-
 software/apps/baremetal/mimo_mmse_f16/main.c  |  46 +++++--
 software/apps/baremetal/mimo_mmse_f32/main.c  |  11 ++
 software/apps/baremetal/mimo_mmse_f8/main.c   |  13 +-
 software/apps/baremetal/mimo_mmse_q16/main.c  |   8 +-
 software/apps/baremetal/ofdm_f16/main.c       |  29 ++--
 .../kernels/baremetal/mempool_chest_q16.h     |   2 +-
 .../kernels/baremetal/mempool_cholesky_f16s.h |   7 +-
 .../kernels/baremetal/mempool_cholesky_f32s.h |   5 +-
 .../kernels/baremetal/mempool_cholesky_q16s.h |   1 -
 .../kernels/baremetal/mempool_cholesky_q32p.h |  93 +++++++------
 .../kernels/baremetal/mempool_cholesky_q32s.h |   2 +-
 .../kernels/baremetal/mempool_cmatmul_f16.h   |   8 +-
 .../kernels/baremetal/mempool_cmatmul_q16.h   |   3 +-
 software/kernels/baremetal/mempool_dotp_f16.h |  10 ++
 software/kernels/baremetal/mempool_dotp_f32.h |  10 ++
 software/kernels/baremetal/mempool_dotp_i32.h |  12 ++
 .../baremetal/mempool_linearsolver_f16s.h     |   6 +-
 .../baremetal/mempool_linearsolver_f32s.h     |   4 +-
 .../baremetal/mempool_linearsolver_q32p.h     | 128 +++++++++---------
 .../baremetal/mempool_linearsolver_q32s.h     |  10 +-
 .../kernels/baremetal/mempool_matmul_f32.h    |   2 +
 .../baremetal/mempool_mimo_mmse_f16s.h        |   9 +-
 .../baremetal/mempool_mimo_mmse_f32p.h        |   3 +-
 .../mempool_radix4_cfft_butterfly_f16.h       |  16 +--
 .../mempool_radix4_cfft_butterfly_q16.h       |  16 +--
 .../baremetal/mempool_radix4_cfft_f16p.h      |  33 +++--
 .../baremetal/mempool_radix4_cfft_q16p.h      |  50 +++----
 software/runtime/runtime.mk                   |   3 +-
 45 files changed, 452 insertions(+), 363 deletions(-)

diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index 9511f7869..bb640dfde 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-FP_APPS := axpy_f16 axpy_f32
-FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
-FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
-FP_APPS += dotp_f16 dotp_f32
-FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16
-
-I_APPS := synth_i32
-I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
-I_APPS += cmatmul_q16 mimo_mmse_q16
-
-ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
-ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
+FP_SUFFIXES := f16 f32 f8
+I_SUFFIXES := q16 q32 i16 i32 i8
+FP_APPS := $(foreach suf, $(FP_SUFFIXES), $(filter %_$(suf), $(ALL)))
+I_APPS := $(foreach suf, $(I_SUFFIXES), $(filter %_$(suf), $(ALL)))
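+# For illustration (app names taken from the removed lists above): with
+# APPS = axpy_f32 cfft_radix2_q16 synth_i32, the suffix filters yield
+# FP_APPS = axpy_f32 and I_APPS = cfft_radix2_q16 synth_i32.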
+# Filter out applications not supported by each compiler
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
index 1795e9059..8bcb38296 100644
--- a/software/apps/baremetal/axpy_f16/main.c
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
index 34ead109c..cb3f1d8a9 100644
--- a/software/apps/baremetal/axpy_f32/main.c
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -15,7 +15,6 @@
 #include "synchronization.h"
 
 #include "data_axpy_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c
index e23fb929e..25510b184 100644
--- a/software/apps/baremetal/cfft_radix2_q16/main.c
+++ b/software/apps/baremetal/cfft_radix2_q16/main.c
@@ -19,7 +19,6 @@
 #include "synchronization.h"
 
 #include "data_cfft_radix2_q16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /* CFFT mempool libraries */
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index b06ae3189..518c06add 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -19,25 +19,30 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))
 
-/* CHOOSE ONE */
-#define PARALLEL // Parallel FFT not "memory-aware".
-// #define FOLDED // Parallel FFT with "memory-aware" load/store.
-//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined runs parallel FFT.
+FOLDED: When defined runs parallel FFT with folded inputs in memory.
+SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
+N_FFTs_ROW: When SCHEDULED is defined, the number of independent FFTs run
+sequentially by each core, on different rows of the array of memory banks.
+N_FFTs_COL: When SCHEDULED is defined, the number of independent FFTs run in
+parallel, on different columns of the array of memory banks.
+
+BITREVERSETABLE: When defined, bit-reversal indices are fetched from a table,
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: When FOLDED is defined, it can additionally be defined to
+also fold the twiddle factors in memory.
+*/
+
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Also the twiddles have "memory-aware" load/stores.
-// #define FOLDED_TWIDDLES
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 1
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 1 +#define N_FFTs_ROW (1) +#define N_FFTs_COL (1) #if (N_FFTs_COL > MAX_COL) -#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)] +#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)] #endif #include "baremetal/mempool_cfft_q16_bitreversal.h" @@ -59,16 +64,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] #endif #if (defined(SCHEDULED) || defined(FOLDED)) -__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); #endif int main() { @@ -96,7 +101,7 @@ int main() { if (core_id == 0) { for (uint32_t j = 0; j < N_FFTs_ROW; j++) { for (uint32_t i = 0; i < N_FFTs_COL; i++) { - dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS), + dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS), l2_pSrc, N_CSAMPLES * sizeof(int32_t)); } } @@ -113,9 +118,11 @@ int main() { for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) { *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] = *(v2h *)&l2_twiddleCoef_f16[2 * i]; - *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] = + *(v2h *)&l1_twiddleCoef_f16_src[2 * + (i + j * N_WORDS_COL + 1 * NUM_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)]; - *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] = + *(v2h *)&l1_twiddleCoef_f16_src[2 * + (i + j * N_WORDS_COL + 2 * NUM_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)]; } } diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c index 08ed80e9b..bebb66059 100644 --- a/software/apps/baremetal/cfft_radix4_q16/main.c +++ b/software/apps/baremetal/cfft_radix4_q16/main.c @@ -19,23 +19,30 @@ /* CFFT data libraries */ #include "data_cfft_radix4_q16.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) +#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4)) -/* CHOOSE ONE */ -//#define SINGLE // Single core FFT. -//#define PARALLEL // Parallel FFT not "memory-aware". -//#define FOLDED // Parallel FFT with "memory-aware" load/store. -#define SCHEDULED // Folded FFTs arranged in rows and cols.''' +/* +====================== +Parameters and defines + +PARALLEL: When defined runs parallel FFT. +FOLDED: When defined runs parallel FFT with folded inputs in memory. +SCHEDULED: When defined runs multiple parallel folded-inputs FFTs. 
+N_FFTs_ROW: When SCHEDULED is defined, the number of independent FFTs run
+sequentially by each core, on different rows of the array of memory banks.
+N_FFTs_COL: When SCHEDULED is defined, the number of independent FFTs run in
+parallel, on different columns of the array of memory banks.
+
+BITREVERSETABLE: When defined, bit-reversal indices are fetched from a table,
+otherwise they are computed by the cores.
+FOLDED_TWIDDLES: When FOLDED is defined, it can additionally be defined to
+also fold the twiddle factors in memory.
+*/
 
-// Bitreversal index from table.
+#define PARALLEL
 #define BITREVERSETABLE
-// Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
-// Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+
+#define N_FFTs_ROW (1)
+#define N_FFTs_COL (1)
 #if (N_FFTs_COL > MAX_COL)
-#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
 #endif
 // Also the twiddles have "memory-aware" load/stores.
 #define FOLDED_TWIDDLES
@@ -60,16 +67,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
 #endif
 
 #if (defined(SCHEDULED) || defined(FOLDED))
-int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -97,7 +104,7 @@ int main() {
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
@@ -112,9 +119,11 @@ int main() {
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
       *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * i];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 2U)];
-      *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+      *(v2s *)&l1_twiddleCoef_q16_src[2 *
+                                      (i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
           *(v2s *)&l2_twiddleCoef_q16[2 * (i * 3U)];
     }
   }
diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
index e0feb90c7..304313788 100644
--- a/software/apps/baremetal/chest_f16/main.c
+++ b/software/apps/baremetal/chest_f16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_chest_f16.h"
 #include "data_chest_f16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
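+
+Exactly one of the two should be defined; this file enables PARALLEL and
+leaves SINGLE commented out below.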
+*/
+
 //#define SINGLE
 #define PARALLEL
 
diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
index 572b12de0..6f7a73938 100644
--- a/software/apps/baremetal/chest_q16/main.c
+++ b/software/apps/baremetal/chest_q16/main.c
@@ -19,6 +19,14 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_chest_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Channel Estimation.
+PARALLEL: When defined runs parallel Channel Estimation.
+*/
+
 #define PARALLEL
 
 int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]
diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
index 6d1c26ff2..10baa6a81 100644
--- a/software/apps/baremetal/cholesky_f16/main.c
+++ b/software/apps/baremetal/cholesky_f16/main.c
@@ -17,6 +17,15 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core Cholesky Decomposition.
+PARALLEL: When defined runs parallel Cholesky Decomposition.
+FOLDED: When set to 1, intermediate results are folded in memory.
+*/
+
 #define SINGLE
 #define FOLDED (0)
 
diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c
index 64fbf3b2f..161d17b30 100644
--- a/software/apps/baremetal/cholesky_q32/main.c
+++ b/software/apps/baremetal/cholesky_q32/main.c
@@ -11,7 +11,6 @@
 #include "synchronization.h"
 
 #define HALF (1023)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
 #define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT))
 #define ABS(a) (a > 0 ? a : -a)
@@ -31,18 +30,19 @@
 #define N_COL 1
 #define N_ROW 1
 int32_t l1_A[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
 int32_t l1_L[matrix_N * matrix_N]
-    __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1")));
+    __attribute__((aligned(NUM_BANKS), section(".l1")));
+int32_t l1_y[matrix_N] __attribute__((aligned(NUM_BANKS), section(".l1")));
 #else
-int32_t l1_AA[matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LL[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_LR[N_ROW * matrix_N * N_BANKS]
-    __attribute__((aligned(N_BANKS), section(".l1_prio")));
-int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio")));
+int32_t l1_AA[matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LL[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_LR[N_ROW * matrix_N * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t l1_yy[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 #endif
 
 int main() {
@@ -58,11 +58,12 @@ int main() {
     for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
       l1_yy[idx_col * matrix_N + i] = l2_y[i];
      for (uint32_t j = 0; j < matrix_N; j++) {
-        l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j];
+        l1_AA[idx_col * matrix_N + i * NUM_BANKS + j] =
+            l2_A[i * matrix_N + j];
       }
     }
   }
-  for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) {
+  for (uint32_t i = 0; i < N_ROW * matrix_N * NUM_BANKS; i++) {
     l1_LL[i] = 0;
     l1_LR[i] = 0;
   }
diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
index aa2ed55a6..727dba7ca 100644
--- a/software/apps/baremetal/cmatmul_f16/main.c
+++ b/software/apps/baremetal/cmatmul_f16/main.c
@@ -19,7 +19,18 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
 
-#define PARALLEL_4x4
+
+/*
+======================
+Parameters and defines
+
+SINGLE_2x2: Single-core matmul on 2x2 tiles.
+PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
+PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
+PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
+PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory
+to avoid banking conflicts.
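+
+For example, with PARALLEL_4x4 each core computes 4x4 tiles of C; on the
+standard 256-core MemPool configuration, one round of tiles covers a 64x64
+block of the output matrix.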
+*/
+
+#define PARALLEL_4x4
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -51,7 +62,7 @@ int main() {
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
 
-#if defined(SINGLE_CORE)
+#if defined(SINGLE_2x2)
   // Execute function to test.
   if (core_id == 0) {
     mempool_start_benchmark();
diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
index 0dcffbfc7..37089fd5b 100644
--- a/software/apps/baremetal/cmatmul_q16/main.c
+++ b/software/apps/baremetal/cmatmul_q16/main.c
@@ -16,6 +16,14 @@
 #include "baremetal/mempool_cmatmul_q16.h"
 #include "data_cmatmul_q16.h"
 
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
 #define PARALLEL
 #define dim_M (matrix_M)
 #define dim_N (matrix_N)
diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
index 2091f0336..3b8b272b9 100644
--- a/software/apps/baremetal/dotp_f16/main.c
+++ b/software/apps/baremetal/dotp_f16/main.c
@@ -14,9 +14,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f16.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,18 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  // // SINGLE-CORE
-  // time_init = mempool_get_timer();
-  // dotp_f16s(l1_X, l1_Y, sum, array_N);
-  // // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N);
-  // time_end = mempool_get_timer();
-
-  // // PARALLEL
-  // time_init = mempool_get_timer();
-  // dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores);
-  // // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores);
-  // time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
index 3507795b1..e1a87b6b8 100644
--- a/software/apps/baremetal/dotp_f32/main.c
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -15,9 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_f32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -47,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  // // SINGLE-CORE
-  // time_init = mempool_get_timer();
-  // dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N);
-  // time_end = mempool_get_timer();
-
-  // // PARALLEL
-  // time_init = mempool_get_timer();
-  // dotp_f32p(l1_A, l1_B, sum, array_N, num_cores);
-  // time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
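/*
 * A minimal sketch of the two reduction strategies documented in the
 * mempool_dotp_*.h headers below; `sums`, `partial`, `core_id`, and
 * `num_cores` are assumed names, not code from this patch.
 *
 *   sums[core_id] = partial; // per-core chunk of the dot-product
 *   mempool_barrier(num_cores);
 *   // SINGLE_CORE_REDUCTION: one core accumulates all partial sums.
 *   if (core_id == 0)
 *     for (uint32_t i = 1; i < num_cores; i++)
 *       sums[0] += sums[i];
 *   // BINARY_REDUCTION (alternative): partial sums are combined pairwise
 *   // in log2(num_cores) steps.
 *   for (uint32_t s = 1; s < num_cores; s <<= 1) {
 *     if ((core_id % (2 * s)) == 0)
 *       sums[core_id] += sums[core_id + s];
 *     mempool_barrier(num_cores);
 *   }
 */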
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index ee2e2ea52..8f6490ee2 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -15,11 +15,6 @@
 #include "synchronization.h"
 
 #include "data_dotp_i32.h"
-#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-#define LOG_BARRIERS
-// #define ATOMIC_REDUCTION
-// #define SINGLE_CORE_REDUCTION
-#define BINARY_REDUCTION
 
 // Vectors for kernel computation
 int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
@@ -49,16 +44,6 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-  // // SINGLE-CORE
-  // time_init = mempool_get_timer();
-  // dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N);
-  // time_end = mempool_get_timer();
-
-  // // PARALLEL
-  // time_init = mempool_get_timer();
-  // dotp_i32p(l1_A, l1_B, sum, array_N, num_cores);
-  // time_end = mempool_get_timer();
-
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
   dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c
index 99a0269cc..9964257ca 100644
--- a/software/apps/baremetal/matmul_f16/main.c
+++ b/software/apps/baremetal/matmul_f16/main.c
@@ -17,7 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f16.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
+#define PARALLEL
 
 __fp16 matrix_a[matrix_M * matrix_N]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c
index d3d7622db..ba9165ed1 100644
--- a/software/apps/baremetal/matmul_f32/main.c
+++ b/software/apps/baremetal/matmul_f32/main.c
@@ -17,8 +17,13 @@
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_f32.h"
 
-#define PARALLEL
-#define ASM
+/*
+======================
+Parameters and defines
+
+SINGLE: When defined runs single-core matmul.
+PARALLEL: When defined runs parallel matmul.
+*/
+
+#define PARALLEL
 
 float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index 80309a1e0..b1ef24451 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -18,25 +18,45 @@
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 #include "data_mimo_mmse_f16.h"
 
-#define ZF (0)   // When asserted use zero-forcing
-#define FOLD (1) // When asserted fold matrices in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
+
+/*
+======================
+Parameters and defines
+
+DOUBLE_BUFFERING: When defined benchmark double-buffered MIMO-MMSE, including
+L2-L1 transfers.
+
+For MIMO-MMSE without L2-L1 transfers:
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When set to 1, use the zero-forcing detector.
+FOLD: When set to 1, fold matrices in memory.
+
+For MIMO-MMSE with L2-L1 transfers:
+DMA_TRANSFER1: When defined, transfer the inputs for the next round at the
+beginning of the computation.
+DMA_TRANSFER2: When defined, transfer the inputs for the next round after the
+Hermitian computation.
+N_ROUNDS: Number of rounds of double-buffering.
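+
+Example: with the defines below, the benchmark runs the parallel (PARALLEL),
+SIMD-vectorized (VEC) MIMO-MMSE without L2-L1 transfers, using MMSE
+regularization (ZF = 0) and matrices folded in memory (FOLD = 1).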
+*/
+
+#define ZF (0)
+#define FOLD (1)
 #define PARALLEL
 #define VEC
 
+#ifndef DOUBLE_BUFFERING
+
 /**********************************************************
 **********************************************************
-  _   _  ___        _     _ _____                       __
- | \ | |/ _ \      | |   / |_   _| __ __ _ _ __  ___   / _|
- | \| | | | |_____| |  | | | || '__/ _` | '_ \/ __| |_
- | |\ | |_| |_____| |___| | | || | | (_| | | | \__ \ _|
- |_| \_|\___/ |_____|_| |_||_| \__,_|_| |_|___/_|(_)
+  _   _  ___    _____                       __
+ | \ | |/ _ \  |_   _| __ __ _ _ __  ___   / _|
+ | \| | | | |_____ | || '__/ _` | '_ \/ __| |_
+ | |\ | |_| |_____ | || | | (_| | | | \__ \ _|
+ |_| \_|\___/ |_||_| \__,_|_| |_|___/_|(_)
 ***********************************************************
 ***********************************************************/
 
-#ifndef DOUBLE_BUFFERING
-
 #if FOLD
 #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
 #define NUM_COL (NUM_BANKS / N_TX)
@@ -193,6 +213,8 @@ int main() {
   return 0;
 }
 
+#else
+
 /**********************************************************
 **********************************************************
   ____   __  __    _     _____                       __
 |  _ \ |  \/  |  / \   |_   _| __ __ _ _ __  ___   / _|
 | | | || |\/| | / _ \    | || '__/ _` | '_ \/ __| |_
 | |_| || |  | |/ ___ \   | || | | (_| | | | \__ \ _|
 |____/ |_|  |_/_/   \_\  |_||_| \__,_|_| |_|___/_|(_)
 ***********************************************************
 ***********************************************************/
 
-#else
-#define N_ROUNDS (1)
-#define DMA_TRANSFER1
-
 // Inputs-Outputs even double-buffering rounds
 __fp16 l1A_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c
index d243754fc..fb054e4e0 100644
--- a/software/apps/baremetal/mimo_mmse_f32/main.c
+++ b/software/apps/baremetal/mimo_mmse_f32/main.c
@@ -21,6 +21,17 @@
 
 #include "data_mimo_mmse_f32.h"
 
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+PARALLEL_HERMITIAN: When defined, the Hermitian computation is parallelized at
+fine granularity over a group of cores.
+ZF: When set to 1, use the zero-forcing detector.
+FOLD: When set to 1, fold matrices in memory.
+*/
+
 #define SINGLE
 #define ZF (0)
 #define FOLD (0)
diff --git a/software/apps/baremetal/mimo_mmse_f8/main.c b/software/apps/baremetal/mimo_mmse_f8/main.c
index c5d1cd77e..006dbf83b 100644
--- a/software/apps/baremetal/mimo_mmse_f8/main.c
+++ b/software/apps/baremetal/mimo_mmse_f8/main.c
@@ -18,9 +18,20 @@
 #include "baremetal/mempool_mimo_mmse_f8s.h"
 #include "data_mimo_mmse_f8.h"
 
+
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+VEC: When defined benchmark SIMD-vectorized kernels.
+ZF: When set to 1, use the zero-forcing detector.
+FOLD: When set to 1, fold matrices in memory.
+*/
+
 #define ZF (0)   // When asserted use zero-forcing
 #define FOLD (0) // When asserted fold matrixes in memory
-#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
 
 #define PARALLEL
 #define VEC
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
index 9bcb5e9db..8e2b557a4 100644
--- a/software/apps/baremetal/mimo_mmse_q16/main.c
+++ b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -16,7 +16,13 @@
 #include "baremetal/mempool_linearsolver_q16s.h"
 #include "baremetal/mempool_mimo_mmse_q16s.h"
 
-#define PARALLEL
+/*
+======================
+Parameters and defines
+
+PARALLEL: When defined benchmark parallel MIMO-MMSE.
+SINGLE: When defined benchmark single-core MIMO-MMSE.
+*/
+
+#define PARALLEL
 
 int16_t l1_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
diff --git a/software/apps/baremetal/ofdm_f16/main.c b/software/apps/baremetal/ofdm_f16/main.c
index 264768199..3cf04dbed 100644
--- a/software/apps/baremetal/ofdm_f16/main.c
+++ b/software/apps/baremetal/ofdm_f16/main.c
@@ -18,7 +18,6 @@
 #include "synchronization.h"
 
 #include "data_ofdm_f16.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // CFFT Parameters
 #define SCHEDULED
@@ -28,7 +27,7 @@
 #define N_FFTs_COL 4
 #define N_FFTs_ROW (N_RX / N_FFTs_COL)
 // CMATMUL Parameters
-#define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX))
+#define NUM_COPIES (NUM_BANKS / (N_BEAMS * N_RX))
 #define dim_M (N_BEAMS)
 #define dim_N (N_RX)
 #define dim_P (N_SC)
@@ -43,18 +42,18 @@ dump(checkpoint, 1);
 uint32_t arrival_index __attribute__((section(".l1_prio")));
 __fp16 l1_pBF_Coef_folded[2 * BANKING_FACTOR * NUM_CORES]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 
-__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[6 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[6 * N_BANKS]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[6 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[6 * NUM_BANKS]
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
-    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+    __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /* MAIN */
@@ -67,7 +66,7 @@ int main() {
   mempool_start_benchmark();
   if (core_id == 0) {
     // Each FFT is folded over 4 memory rows
-    // Each memory row is 2 * N_BANKS samples
+    // Each memory row is 2 * NUM_BANKS samples
     __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED);
     dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src,
                         (N_RX * N_SC) * sizeof(int32_t));
@@ -78,7 +77,7 @@
                           dim_M * dim_N * sizeof(int32_t));
     }
     for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
+      dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS),
                           l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t));
     }
   }
@@ -114,7 +113,7 @@ int main() {
   dma_memcpy_blocking(l2_pBF_Dst, l1_pFFT_Dst, (N_RX * N_SC) * sizeof(int32_t));
   for (uint32_t i = 0; i < N_FFTs_COL; i++) {
-    dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS),
                         l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t));
   }
   __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED);
diff --git a/software/kernels/baremetal/mempool_chest_q16.h b/software/kernels/baremetal/mempool_chest_q16.h
index b4d90adff..6e735bbe7 100644
--- a/software/kernels/baremetal/mempool_chest_q16.h
+++ b/software/kernels/baremetal/mempool_chest_q16.h
@@ -6,7 +6,7 @@
 #pragma once
 #include "builtins_v2.h"
 
-#define __MUL
+#define __MUL // Multiplication by pilot instead of division.
 
 /* a[i] = ar[i] + i * ai[j]
    out[i][j] = a[i] / c[j]
diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h
index 3b42bdb80..1d7b67e36 100644
--- a/software/kernels/baremetal/mempool_cholesky_f16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f16s.h
@@ -7,7 +7,6 @@
 
 #pragma once
 #include "builtins_v2.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 #ifdef __XDIVSQRT
 
@@ -29,7 +28,7 @@ void mempool_cholesky_f16s(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
   __fp16 ap, bp; // Pivot element
   __fp16 as, bs; // Sum element
   uint32_t i, j, k;
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
     // Elements on diagonal (input matrix is positive-definite)
@@ -103,7 +102,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
   v2h apbp, dgdg;
   v2h ab, cd;
   uint32_t i, j, k;
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
@@ -383,7 +382,7 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
   v2h ab, cd, ndc;
   uint32_t i, j, k;
 
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
diff --git a/software/kernels/baremetal/mempool_cholesky_f32s.h b/software/kernels/baremetal/mempool_cholesky_f32s.h
index 63fd878dc..135d00fcd 100644
--- a/software/kernels/baremetal/mempool_cholesky_f32s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f32s.h
@@ -4,9 +4,7 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
 #pragma once
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-
 #ifdef __XDIVSQRT
 
 /**
@@ -26,7 +23,7 @@ void mempool_cholesky_f32s(float *pSrc, float *pL, const uint32_t n,
   float ap, bp; // Pivot element
   float as, bs; // Sum element
   uint32_t i, j, k;
-  const uint32_t offset = folded ? N_BANKS : n;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   for (j = 0; j < n; j++) {
 
diff --git a/software/kernels/baremetal/mempool_cholesky_q16s.h b/software/kernels/baremetal/mempool_cholesky_q16s.h
index dc20a2b94..fe7c2bd8a 100644
--- a/software/kernels/baremetal/mempool_cholesky_q16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_q16s.h
@@ -7,7 +7,6 @@
 #pragma once
 #include "baremetal/mempool_sqrt_q32s.h"
 #include "builtins_v2.h"
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /** VECTORIZED CODE
     @brief Cholesky decomposition with Crout algorithm.
diff --git a/software/kernels/baremetal/mempool_cholesky_q32p.h b/software/kernels/baremetal/mempool_cholesky_q32p.h
index 88819e842..ec40172f5 100644
--- a/software/kernels/baremetal/mempool_cholesky_q32p.h
+++ b/software/kernels/baremetal/mempool_cholesky_q32p.h
@@ -325,13 +325,13 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n,
       uint32_t matrix_row = (FoldLeft == 1) ?
j : (n - 1 - j); /* Elements on the diagonal are computed with a single core */ if (core_id == core_idx) { - pivot = pSrc[j * N_BANKS + j]; + pivot = pSrc[j * NUM_BANKS + j]; sum = 0; for (k = 0; k < 4 * (j >> 2U); k++) { - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; - a3 = pL[matrix_row + (k + 3) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[matrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "mul %[a2],%[a2],%[a2];" @@ -355,9 +355,9 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, } switch (j % 4) { case 3: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" @@ -376,8 +376,8 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, :); break; case 2: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "addi %[a0],%[a0],%[h];" @@ -391,7 +391,7 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, :); break; case 1: - a0 = pL[matrix_row + k * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -403,7 +403,8 @@ void mempool_cholesky_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, const uint32_t n, case 0: break; } - pL[matrix_row + j * N_BANKS] = mempool_sqrt_q32s(pivot - sum, FIXED_POINT); + pL[matrix_row + j * NUM_BANKS] = + mempool_sqrt_q32s(pivot - sum, FIXED_POINT); } return; } @@ -427,17 +428,17 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, if (core_id == core_idx) { sum = 0; - pivot = pSrc[i * N_BANKS + j]; - diag = pL[jmatrix_row + j * N_BANKS]; + pivot = pSrc[i * NUM_BANKS + j]; + diag = pL[jmatrix_row + j * NUM_BANKS]; for (k = 0; k < 4 * (j >> 2U); k += 4) { - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - a3 = pL[imatrix_row + (k + 3) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; - b3 = pL[jmatrix_row + (k + 3) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[imatrix_row + (k + 3) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; + b3 = pL[jmatrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -462,12 +463,12 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, } switch (j % 4) { case 3: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; + a0 = 
pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -487,10 +488,10 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, :); break; case 2: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -505,8 +506,8 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, :); break; case 1: - a0 = pL[imatrix_row + k * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -518,7 +519,7 @@ void mempool_cholesky_q32p_divisum(int32_t *pSrc, int32_t *pL, uint32_t core_id, case 0: break; } - pL[imatrix_row + j * N_BANKS] = FIX_DIV((pivot - sum), diag); + pL[imatrix_row + j * NUM_BANKS] = FIX_DIV((pivot - sum), diag); } } return; @@ -557,23 +558,25 @@ void mempool_cholesky_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (j = 0; j < n; j++) { for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { - mempool_cholesky_q32p_sqrtsum( - pSrcA + column_id * n, pLL + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 1); // FoldLeft - mempool_cholesky_q32p_sqrtsum( - pSrcB + column_id * n, pLR + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 0); // FoldRight + mempool_cholesky_q32p_sqrtsum(pSrcA + column_id * n, + pLL + idx_col * n + + idx_row * (n * NUM_BANKS), + core_id, n, j, 1); // FoldLeft + mempool_cholesky_q32p_sqrtsum(pSrcB + column_id * n, + pLR + idx_col * n + + idx_row * (n * NUM_BANKS), + core_id, n, j, 0); // FoldRight } } mempool_log_partial_barrier(2, absolute_core_id, n_col * (n >> 2U)); for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_cholesky_q32p_divisum( - pSrcA + column_id * n, pLL + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 1); + pSrcA + column_id * n, + pLL + idx_col * n + idx_row * (n * NUM_BANKS), core_id, n, j, 1); mempool_cholesky_q32p_divisum( - pSrcB + column_id * n, pLR + idx_col * n + idx_row * (n * N_BANKS), - core_id, n, j, 0); + pSrcB + column_id * n, + pLR + idx_col * n + idx_row * (n * NUM_BANKS), core_id, n, j, 0); } } mempool_log_partial_barrier(2, absolute_core_id, n_col * (n >> 2U)); diff --git a/software/kernels/baremetal/mempool_cholesky_q32s.h b/software/kernels/baremetal/mempool_cholesky_q32s.h index c7a5a60c7..5ce497e96 100644 --- a/software/kernels/baremetal/mempool_cholesky_q32s.h +++ b/software/kernels/baremetal/mempool_cholesky_q32s.h @@ -320,7 +320,7 @@ void mempool_cholesky_schedule_q32s(int32_t *pSrc, int32_t *pL, uint32_t idx_row, idx_col = core_id; for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_cholesky_crout_q32s(pSrc + idx_col * n, - pL + idx_col * n + idx_row * N_BANKS, n); + pL + idx_col * n + idx_row * NUM_BANKS, n); } mempool_log_partial_barrier(2, core_id, 
n_col * (n >> 2U)); } diff --git a/software/kernels/baremetal/mempool_cmatmul_f16.h b/software/kernels/baremetal/mempool_cmatmul_f16.h index 12645c454..374144f64 100644 --- a/software/kernels/baremetal/mempool_cmatmul_f16.h +++ b/software/kernels/baremetal/mempool_cmatmul_f16.h @@ -13,10 +13,8 @@ #pragma once #include "builtins_v2.h" -// Use complex dotp in a single offload -#define __CDOTP -// Shift cores startpoint over rows of matrix A -#define __SHIFT_A +#define __CDOTP // Use complex dotp in a single offload +#define __SHIFT_A // Shift cores startpoint over rows of matrix A /****************************************************************************** __ ___ _ _ ____ _ @@ -559,7 +557,7 @@ void cmatmul_4x4_f16p(__fp16 const *__restrict__ A, return; } -// 4x4 MATMUL with copies of A matrix (for M*N < N_BANKS) +// 4x4 MATMUL with copies of A matrix (for M*N < NUM_BANKS) void cmatmul_4x4_f16p_copy_A(__fp16 const *__restrict__ A_l2, __fp16 *__restrict__ A_l1, __fp16 const *__restrict__ B, diff --git a/software/kernels/baremetal/mempool_cmatmul_q16.h b/software/kernels/baremetal/mempool_cmatmul_q16.h index aa6a71b6c..78ed04b31 100644 --- a/software/kernels/baremetal/mempool_cmatmul_q16.h +++ b/software/kernels/baremetal/mempool_cmatmul_q16.h @@ -13,8 +13,7 @@ #pragma once #include "builtins_v2.h" -// Shift cores startpoint over rows of matrix A -#define __SHIFT_A +#define __SHIFT_A // Shift cores startpoint over rows of matrix A #define CMATMUL_1x1_LOOP \ v2s sum = {0, 0}; \ diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h index 791b7c68e..e8083cfcf 100644 --- a/software/kernels/baremetal/mempool_dotp_f16.h +++ b/software/kernels/baremetal/mempool_dotp_f16.h @@ -7,6 +7,16 @@ #pragma once #include "builtins_v2.h" +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPF16VEC_UNROLLED4_LOOP \ { \ a01 = (*(v2h *)&in_a[i]); \ diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h index 290b96d59..13e4ff9e5 100644 --- a/software/kernels/baremetal/mempool_dotp_f32.h +++ b/software/kernels/baremetal/mempool_dotp_f32.h @@ -4,6 +4,16 @@ // Author: Marco Bertuletti, ETH Zurich +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPF32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ diff --git a/software/kernels/baremetal/mempool_dotp_i32.h b/software/kernels/baremetal/mempool_dotp_i32.h index 4b80e92ed..3f8320b91 100644 --- a/software/kernels/baremetal/mempool_dotp_i32.h +++ b/software/kernels/baremetal/mempool_dotp_i32.h @@ -4,6 +4,18 @@ // Author: Marco Bertuletti, ETH Zurich +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +ATOMIC_REDUCTION: Reduction with atomics. 
+LOG_BARRIERS: Use binary reduction +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPI32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ diff --git a/software/kernels/baremetal/mempool_linearsolver_f16s.h b/software/kernels/baremetal/mempool_linearsolver_f16s.h index c4e134527..919fd3585 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f16s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f16s.h @@ -5,8 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) - #ifdef __XDIVSQRT /** @@ -30,7 +28,7 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n, __fp16 as, bs; __fp16 ax, bx; __fp16 diag; - const uint32_t offset = folded ? N_BANKS : n; + const uint32_t offset = folded ? NUM_BANKS : n; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { @@ -98,7 +96,7 @@ void mempool_Ltrisol_f16s(__fp16 *pL, __fp16 *in, __fp16 *x, const uint32_t n, __fp16 as, bs; __fp16 diag; - const uint32_t offset = folded ? N_BANKS : n; + const uint32_t offset = folded ? NUM_BANKS : n; float ax, bx, diag_f32; v2h res; diff --git a/software/kernels/baremetal/mempool_linearsolver_f32s.h b/software/kernels/baremetal/mempool_linearsolver_f32s.h index d3297397c..02f38b698 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f32s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f32s.h @@ -5,8 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) - #ifdef __XDIVSQRT /** @@ -29,7 +27,7 @@ void mempool_Ltrisol_f32s(float *pL, float *in, float *x, const uint32_t n, float as, bs; float ax, bx; float diag; - const uint32_t offset = folded ? N_BANKS : n; + const uint32_t offset = folded ? NUM_BANKS : n; // Solve for each variable x_i in turn for (i = 0; i < n; i++) { diff --git a/software/kernels/baremetal/mempool_linearsolver_q32p.h b/software/kernels/baremetal/mempool_linearsolver_q32p.h index 49b629259..5fbc2f230 100644 --- a/software/kernels/baremetal/mempool_linearsolver_q32p.h +++ b/software/kernels/baremetal/mempool_linearsolver_q32p.h @@ -135,13 +135,13 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, /* Elements on the diagonal are computed with a single core */ if (core_id == core_idx) { in = pIn[j]; - pivot = pSrc[matrix_row * N_BANKS + j]; + pivot = pSrc[matrix_row * NUM_BANKS + j]; sum = 0; for (k = 0; k < 4 * (j >> 2U); k++) { - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; - a3 = pL[matrix_row + (k + 3) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[matrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "mul %[a2],%[a2],%[a2];" @@ -165,9 +165,9 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } switch (j % 4) { case 3: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; - a2 = pL[matrix_row + (k + 2) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; + a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[matrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" @@ -186,8 +186,8 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 2: - a0 = pL[matrix_row + k * N_BANKS]; - a1 = pL[matrix_row + (k + 1) * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; 
+ a1 = pL[matrix_row + (k + 1) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "mul %[a1],%[a1],%[a1];" "addi %[a0],%[a0],%[h];" @@ -201,7 +201,7 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 1: - a0 = pL[matrix_row + k * N_BANKS]; + a0 = pL[matrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[a0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -215,7 +215,7 @@ void mempool_linearsolver_q32p_sqrtsum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } result = mempool_sqrt_q32s(pivot - sum, FIXED_POINT); pIn[j] = FIX_DIV(in, result); - pL[matrix_row + j * N_BANKS] = result; + pL[matrix_row + j * NUM_BANKS] = result; } } @@ -238,19 +238,19 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, if (core_id == core_idx) { sum = 0; - pivot = pSrc[i * N_BANKS + j]; - diag = pL[jmatrix_row + j * N_BANKS]; + pivot = pSrc[i * NUM_BANKS + j]; + diag = pL[jmatrix_row + j * NUM_BANKS]; in = pIn[j]; sum_r = pIn[i]; for (k = 0; k < 4 * (j >> 2U); k += 4) { - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - a3 = pL[imatrix_row + (k + 3) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; - b3 = pL[jmatrix_row + (k + 3) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + a3 = pL[imatrix_row + (k + 3) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; + b3 = pL[jmatrix_row + (k + 3) * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -275,12 +275,12 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } switch (j % 4) { case 3: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - a2 = pL[imatrix_row + (k + 2) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; - b2 = pL[jmatrix_row + (k + 2) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + a2 = pL[imatrix_row + (k + 2) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; + b2 = pL[jmatrix_row + (k + 2) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -300,10 +300,10 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 2: - a0 = pL[imatrix_row + k * N_BANKS]; - a1 = pL[imatrix_row + (k + 1) * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; - b1 = pL[jmatrix_row + (k + 1) * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + a1 = pL[imatrix_row + (k + 1) * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; + b1 = pL[jmatrix_row + (k + 1) * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -318,8 +318,8 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, :); break; case 1: - a0 = pL[imatrix_row + k * N_BANKS]; - b0 = pL[jmatrix_row + k * N_BANKS]; + a0 = pL[imatrix_row + k * NUM_BANKS]; + b0 = pL[jmatrix_row + k * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -333,7 +333,7 @@ void mempool_linearsolver_q32p_divisum(int32_t *pSrc, int32_t *pL, int32_t *pIn, } result = 
FIX_DIV((pivot - sum), diag); pIn[i] = sum_r - result * in; - pL[imatrix_row + j * N_BANKS] = result; + pL[imatrix_row + j * NUM_BANKS] = result; } } } @@ -353,10 +353,10 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, a1 = pIn[k - 1]; a2 = pIn[k - 2]; a3 = pIn[k - 3]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k - 1) + i * N_BANKS]; - b2 = pL[(k - 2) + i * N_BANKS]; - b3 = pL[(k - 3) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k - 1) + i * NUM_BANKS]; + b2 = pL[(k - 2) + i * NUM_BANKS]; + b3 = pL[(k - 3) + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -384,9 +384,9 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, a0 = pIn[k]; a1 = pIn[k - 1]; a2 = pIn[k - 2]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k - 1) + i * N_BANKS]; - b2 = pL[(k - 2) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k - 1) + i * NUM_BANKS]; + b2 = pL[(k - 2) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -408,8 +408,8 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, case 2: a0 = pIn[k]; a1 = pIn[k - 1]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k - 1) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k - 1) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -425,7 +425,7 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, break; case 3: a0 = pIn[k]; - b0 = pL[k + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -437,7 +437,7 @@ void mempool_linearsolver_q32p_trisolverL(int32_t *pL, int32_t *pIn, case 0: break; } - pIn[i] = FIX_DIV(sum, pL[i * N_BANKS + i]); + pIn[i] = FIX_DIV(sum, pL[i * NUM_BANKS + i]); } } } @@ -457,10 +457,10 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, a1 = pIn[n - 1 - k - 1]; a2 = pIn[n - 1 - k - 2]; a3 = pIn[n - 1 - k - 3]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k + 1) + i * N_BANKS]; - b2 = pL[(k + 2) + i * N_BANKS]; - b3 = pL[(k + 3) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k + 1) + i * NUM_BANKS]; + b2 = pL[(k + 2) + i * NUM_BANKS]; + b3 = pL[(k + 3) + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" "mul %[a2],%[a2],%[b2];" @@ -488,9 +488,9 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, a0 = pIn[n - 1 - k]; a1 = pIn[n - 1 - k - 1]; a2 = pIn[n - 1 - k - 2]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k + 1) + i * N_BANKS]; - b2 = pL[(k + 2) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k + 1) + i * NUM_BANKS]; + b2 = pL[(k + 2) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -512,8 +512,8 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, case 2: a0 = pIn[n - 1 - k]; a1 = pIn[n - 1 - k - 1]; - b0 = pL[k + i * N_BANKS]; - b1 = pL[(k + 1) + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; + b1 = pL[(k + 1) + i * NUM_BANKS]; asm volatile( "mul %[a0],%[a0],%[b0];" "mul %[a1],%[a1],%[b1];" @@ -529,7 +529,7 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, break; case 1: a0 = pIn[n - 1 - k]; - b0 = pL[k + i * N_BANKS]; + b0 = pL[k + i * NUM_BANKS]; asm volatile("mul %[a0],%[a0],%[b0];" "addi %[a0],%[a0],%[h];" "srai %[a0],%[a0],%[s];" @@ -541,7 +541,7 @@ void mempool_linearsolver_q32p_trisolverR(int32_t *pL, int32_t *pIn, case 0: break; } - pIn[i] = FIX_DIV(sum, pL[i * N_BANKS + i]); + pIn[i] = 
FIX_DIV(sum, pL[i * NUM_BANKS + i]); } } } @@ -585,10 +585,10 @@ void mempool_linearsolver_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32p_sqrtsum( - pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * N_BANKS), + pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, j, 1); mempool_linearsolver_q32p_sqrtsum( - pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * N_BANKS), + pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, j, 0); } } @@ -596,10 +596,10 @@ void mempool_linearsolver_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32p_divisum( - pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * N_BANKS), - pIn + idx_col * n + idx_row * N_BANKS, core_id, n, j, 1); + pSrcA + idx_col * n, pLL + idx_col * n + idx_row * (n * NUM_BANKS), + pIn + idx_col * n + idx_row * NUM_BANKS, core_id, n, j, 1); mempool_linearsolver_q32p_divisum( - pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * N_BANKS), + pSrcB + idx_col * n, pLR + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, j, 0); } } @@ -609,10 +609,10 @@ void mempool_linearsolver_fold_schedule_q32p(int32_t *pSrcA, int32_t *pSrcB, for (idx_col = column_id; idx_col < n_col; idx_col += n_col) { for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32p_trisolverL(pLL + idx_col * n + - idx_row * (n * N_BANKS), + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n); mempool_linearsolver_q32p_trisolverR( - pLR + idx_col * n + idx_row * (n * N_BANKS), pIn + idx_col * n, + pLR + idx_col * n + idx_row * (n * NUM_BANKS), pIn + idx_col * n, core_id, n, n_col * (n >> 2U)); } } diff --git a/software/kernels/baremetal/mempool_linearsolver_q32s.h b/software/kernels/baremetal/mempool_linearsolver_q32s.h index d03ec273f..848c0fdeb 100644 --- a/software/kernels/baremetal/mempool_linearsolver_q32s.h +++ b/software/kernels/baremetal/mempool_linearsolver_q32s.h @@ -24,7 +24,7 @@ void mempool_lowtrisolver_q32s(int32_t *pL, int32_t *pIn, const uint32_t n, int32_t in0, in1, in2, in3; int32_t l0, l1, l2, l3; - uint32_t OFFSET = (folded == 1) ? N_BANKS : n; + uint32_t OFFSET = (folded == 1) ? NUM_BANKS : n; for (i = 0; i < n; i++) { sum = pIn[i]; @@ -140,7 +140,7 @@ void mempool_uprtrisolver_q32s(int32_t *pL, int32_t volatile *pIn, int32_t in0, in1, in2, in3; int32_t l0, l1, l2, l3; - uint32_t OFFSET = (folded == 1) ? N_BANKS : n; + uint32_t OFFSET = (folded == 1) ? NUM_BANKS : n; for (i = n - 1; i < n; i--) { sum = pIn[i]; @@ -266,7 +266,7 @@ void mempool_linearsolver_q32s(int32_t *pSrc, int32_t *pL, int32_t a0, a1, a2, a3; int32_t b0, b1, b2, b3; - uint32_t OFFSET = (folded == 1) ? N_BANKS : n; + uint32_t OFFSET = (folded == 1) ? 
NUM_BANKS : n; for (j = 0; j < n; j++) { in = pIn[j]; @@ -483,9 +483,9 @@ void mempool_linearsolver_schedule_q32s(int32_t *pSrc, int32_t *pL, uint32_t idx_row, idx_col = core_id; for (idx_row = 0; idx_row < n_row; idx_row++) { mempool_linearsolver_q32s(pSrc + idx_col * n, - pL + idx_col * n + idx_row * N_BANKS, + pL + idx_col * n + idx_row * NUM_BANKS, pIn + idx_col * n, n, 1); - mempool_uprtrisolver_q32s(pL + idx_col * n + idx_row * N_BANKS, + mempool_uprtrisolver_q32s(pL + idx_col * n + idx_row * NUM_BANKS, pIn + idx_col * n, n, 1); } mempool_log_partial_barrier(2, core_id, n_col * (n >> 2U)); diff --git a/software/kernels/baremetal/mempool_matmul_f32.h b/software/kernels/baremetal/mempool_matmul_f32.h index 8879fa52d..c6b669bcc 100644 --- a/software/kernels/baremetal/mempool_matmul_f32.h +++ b/software/kernels/baremetal/mempool_matmul_f32.h @@ -13,6 +13,8 @@ #pragma once #include "builtins_v2.h" +// When ASM is defined, the 4x4 matmul is executed with asm volatile statements. +#define ASM void matmul_2x2_single_f32(float const *__restrict__ A, float const *__restrict__ B, float *__restrict__ C, diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h index 91e3aa789..134da8905 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h @@ -7,7 +7,6 @@ #pragma once #include "builtins_v2.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) /****************************************************************************** _____ __ _ @@ -112,7 +111,7 @@ void mempool_hermitian_f16s(__fp16 *pH, __fp16 *pG, __fp16 *pS, bs3 = (__fp16)0.0f; } } - uint32_t const offset = folded ? N_BANKS : n_tx; + uint32_t const offset = folded ? NUM_BANKS : n_tx; // Store pG[2 * (i * offset + j)] = as0; pG[2 * (i * offset + j + 1U)] = as1; @@ -285,7 +284,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i])); } // Store - uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j); + uint32_t addr = folded ? 2 * (i * NUM_BANKS + j) : 2 * (i * n_tx + j); (*(v2h *)&pG[addr]) = res0; } @@ -356,7 +355,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i])); } } - uint32_t const offset = folded ? N_BANKS : n_tx; + uint32_t const offset = folded ? NUM_BANKS : n_tx; // Store (*(v2h *)&pG[2 * (i * offset + j)]) = res0; (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1; @@ -415,7 +414,7 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS, asm volatile("fadd.h %0, %0, %1;" : "+&r"(res3) : "r"(pS[2 * i])); } } - uint32_t const offset = folded ? N_BANKS : n_tx; + uint32_t const offset = folded ? NUM_BANKS : n_tx; // Store (*(v2h *)&pG[2 * (i * offset + j)]) = res0; (*(v2h *)&pG[2 * (i * offset + j + 1U)]) = res1; diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h index 7e3e6fe1a..13ebcb537 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f32p.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f32p.h @@ -5,7 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define N_BANKS (NUM_CORES * BANKING_FACTOR) /** @brief Computes the Hermitian matrix G = (H'*H + pS^2I). @@ -102,7 +101,7 @@ void mempool_hermitian_f32p(float *pH, float *pG, float *pS, bs3 = 0.0f; } } - uint32_t const offset = folded ? 
NUM_BANKS : n_tx; // Store pG[2 * (i * offset + j)] = as0; pG[2 * (i * offset + j + 1U)] = as1; diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h index 3ce36f3b6..12b2320e5 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h @@ -48,7 +48,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, // STORE INDEXES #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; - i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -158,9 +158,9 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + * 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + @@ -173,7 +173,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -278,9 +278,9 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h index a0c1a7791..5db6be1cf 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h @@ -50,7 +50,7 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, // STORE INDEXES #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; - i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -227,9 +227,9 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + * 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + @@ -242,7 +242,7 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) 
/ n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -403,9 +403,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h index c6b4acf6b..90a3fd093 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h @@ -5,7 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define BITREVERSETABLE #include "builtins_v2.h" #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -33,19 +32,19 @@ #ifdef FOLDED_TWIDDLES #define LOAD_STORE_TWIDDLEFACT \ CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ - CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ - CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)]; \ if (ic % 4 == 0) { \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ @@ -315,8 +314,8 @@ void mempool_radix4_cfft_f16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -345,8 +344,8 @@ void mempool_radix4_cfft_f16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -370,8 +369,8 @@ void mempool_radix4_cfft_f16p_scheduler( uint32_t col_shift = fftLen / 4; #endif - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift; 
radix4_butterfly_last(pIn, pOut, i0); } } @@ -416,7 +415,7 @@ void mempool_radix4_cfft_f16p_scheduler( : [s2] "r"(s2) :); for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8)); // Load at address a tmpa1 = *(uint32_t *)&ptr[a1]; tmpa2 = *(uint32_t *)&ptr[a2]; @@ -465,12 +464,12 @@ void mempool_radix4_cfft_f16p_scheduler( idx3 = idx3 >> 1U; } idx0 = ic / 4; - idx1 = ic / 4 + N_BANKS; - idx2 = ic / 4 + 2 * N_BANKS; - idx3 = ic / 4 + 3 * N_BANKS; + idx1 = ic / 4 + NUM_BANKS; + idx2 = ic / 4 + 2 * NUM_BANKS; + idx3 = ic / 4 + 3 * NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); - ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8); *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); diff --git a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h index e91602866..ef6122a00 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h @@ -45,19 +45,19 @@ #define LOAD_STORE_TWIDDLEFACT \ CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ - CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ - CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)]; \ if (ic % 4 == 0) { \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ @@ -226,16 +226,16 @@ static inline void fold_radix4(int16_t *pSrc16, uint32_t fftLen, i1 = i0 + n2; i2 = i1 + n2; i3 = i2 + n2; - i1_store = i0 + N_BANKS; - i2_store = i1_store + N_BANKS; - i3_store = i2_store + N_BANKS; + i1_store = i0 + NUM_BANKS; + i2_store = i1_store + NUM_BANKS; + i3_store = i2_store + NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * N_BANKS)]; - B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * N_BANKS)]; - C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * N_BANKS)]; - *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * N_BANKS)] = A; - *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * N_BANKS)] = B; - *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * N_BANKS)] = C; + A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * NUM_BANKS)]; + B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * NUM_BANKS)]; + C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * NUM_BANKS)]; + *(v2s 
*)&pSrc16[i1_store * 2U + idx_row * (8 * NUM_BANKS)] = A; + *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * NUM_BANKS)] = B; + *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * NUM_BANKS)] = C; } } mempool_log_partial_barrier(2, absolute_core_id, nPE); @@ -426,8 +426,8 @@ void mempool_radix4_cfft_q16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -460,8 +460,8 @@ void mempool_radix4_cfft_q16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -489,8 +489,8 @@ void mempool_radix4_cfft_q16p_scheduler( uint32_t col_shift = fftLen / 4; #endif - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift; radix4_butterfly_last(pIn, pOut, i0); } } @@ -535,7 +535,7 @@ void mempool_radix4_cfft_q16p_scheduler( : [s2] "r"(s2) :); for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8)); // Load at address a tmpa1 = *(uint32_t *)&ptr[a1]; tmpa2 = *(uint32_t *)&ptr[a2]; @@ -584,12 +584,12 @@ void mempool_radix4_cfft_q16p_scheduler( idx3 = idx3 >> 1U; } idx0 = ic / 4; - idx1 = ic / 4 + N_BANKS; - idx2 = ic / 4 + 2 * N_BANKS; - idx3 = ic / 4 + 3 * N_BANKS; + idx1 = ic / 4 + NUM_BANKS; + idx2 = ic / 4 + 2 * NUM_BANKS; + idx3 = ic / 4 + 3 * NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); - ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8); *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 2602804cc..94f822ddc 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -27,7 +27,7 @@ DATA_DIR ?= $(abspath $(ROOT_DIR)/../data) COMPILER ?= gcc XPULPIMG ?= $(xpulpimg) ZFINX ?= $(zfinx) -XDIVSQRT ?= $(xDivSqrt) +XDIVSQRT ?= $(xDivSqrt) RISCV_XLEN ?= 32 @@ -92,6 +92,7 @@ DEFINES += -DNUM_CORES=$(num_cores) DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) DEFINES += -DBANKING_FACTOR=$(banking_factor) +DEFINES += -DNUM_BANKS=$(shell awk 'BEGIN{print $(banking_factor)*$(num_cores)}') DEFINES += 
-DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}') DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}') DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')
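
With NUM_BANKS now computed once in runtime.mk from banking_factor and num_cores, the kernels no longer carry a private N_BANKS define. A minimal compile-time check, hypothetical and not part of this patch, that a kernel could use to confirm the build-level value still matches the removed per-header formula:

    #include <assert.h>
    /* NUM_CORES, BANKING_FACTOR and NUM_BANKS all arrive as -D flags from
       runtime.mk; this hypothetical check keeps them consistent with the
       removed per-header "#define N_BANKS (NUM_CORES * BANKING_FACTOR)". */
    static_assert(NUM_BANKS == NUM_CORES * BANKING_FACTOR,
                  "NUM_BANKS must equal NUM_CORES * BANKING_FACTOR");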
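The hermitian and linear-solver kernels above all switch their row stride on the same condition: a folded matrix is laid out across the banked memory with stride NUM_BANKS, an unfolded one with its natural row length (n or n_tx). A sketch of that addressing convention, with a helper name invented for illustration:

    #include <stdint.h>
    /* Illustrative only: element (i, j) of a matrix with row length n is
       found at stride n in the linear layout, and at stride NUM_BANKS when
       the matrix is folded across the memory banks. */
    static inline uint32_t folded_index(uint32_t i, uint32_t j, uint32_t n,
                                        uint32_t folded) {
      uint32_t const stride = folded ? NUM_BANKS : n;
      return i * stride + j;
    }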
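The FOLDED store indices in the radix-4 butterflies follow the same idea: the n2 outputs of a stage are cut into four quarters of n2_store = n2 / 4 elements, and the quarter number selects a NUM_BANKS-strided row. As a worked example of the first-stage formula i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS: with n2 = 16 and n2_store = 4, index i0 = 5 gives 5 % 4 = 1 and 5 / 4 = 1, so the output lands at 1 * NUM_BANKS + 1, i.e. column 1 of the second folded row.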
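In the q32 solver hunks, each asm block ends with an addi on %[h] followed by an srai on %[s]; read as fixed-point arithmetic, this adds half an LSB of the target format and then shifts the extra fractional bits away. A scalar C model of that pair, with HALF and FIXS as illustrative stand-ins for the operands bound to %[h] and %[s]:

    #include <stdint.h>
    /* Hypothetical model of "addi %[a0],%[a0],%[h]; srai %[a0],%[a0],%[s]":
       round the 32-bit product by half an LSB, then rescale it with an
       arithmetic right shift. HALF = 1 << (FIXS - 1) under this reading. */
    static inline int32_t fix_round_shift(int32_t prod) {
      return (prod + HALF) >> FIXS;
    }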
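Finally, the ASM define added to mempool_matmul_f32.h documents that the 4x4 f32 kernel relies on asm volatile statements. The helper below is not that kernel, only a hedged sketch of the pattern such a define typically gates:

    /* Invented for illustration: an ASM-gated multiply-accumulate with FP
       values kept in integer registers, matching the "r" constraints used
       by the kernels in this patch. */
    static inline float mac_f32(float c, float a, float b) {
    #ifdef ASM
      asm volatile("fmadd.s %[c], %[a], %[b], %[c];"
                   : [c] "+&r"(c)
                   : [a] "r"(a), [b] "r"(b));
    #else
      c += a * b; /* plain-C fallback */
    #endif
      return c;
    }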